In [1]:
from appgeopy import *
from my_packages import *

In [2]:
# Load the primary dataset from a compressed pickle file.
# This file contains the raw time-series data for all stations.
# NOTE(review): unpickling can execute arbitrary code, so only load files that
# come from a trusted source.
df = pd.read_pickle(r"20250724_GWR_InputData_MLCW_InSAR.xz")

# Engineer a time-based feature: the calendar month of each reading, as a
# monthly pandas Period.
month_period = df["time"].dt.to_period("M")

# Number of months elapsed since the earliest month in the dataset
# (0, 1, 2, ...). The reference month is taken with .min() instead of the first
# row so the index is still correct when the rows are not sorted by time.
# Subtracting two Periods yields a DateOffset; its `.n` is the month count.
df["monthly"] = (month_period - month_period.min()).apply(
    lambda offset: offset.n
)

# Get a list of all unique station names present in the dataset.
unique_stations = df["STATION"].unique()

df

Unnamed: 0,time,STATION,X_TWD97,Y_TWD97,Layer_1,Layer_2,Layer_3,Layer_4,CUMDISP,monthly
0,2016-04-01,ANHE,179539.204623,2.602035e+06,-33.0,-88.0,-13.0,-2.0,0.000000,0
1,2016-04-01,BEICHEN,178859.958807,2.608229e+06,-8.0,-32.0,-20.0,-8.0,0.000000,0
2,2016-04-01,CANLIN,173088.151033,2.608157e+06,-3.0,-45.0,-56.0,-16.0,0.000000,0
3,2016-04-01,DONGGUANG,175783.144962,2.616755e+06,-23.0,-77.0,-93.0,-3.0,0.000000,0
4,2016-04-01,ERLUN,190429.148778,2.629865e+06,-19.0,-23.0,-22.0,-4.0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...
2161,2021-12-01,TUKU,187772.134694,2.620611e+06,-38.0,-203.0,-377.0,-34.0,-259.472772,68
2162,2021-12-01,XINSHENG,188342.160622,2.648279e+06,-19.0,-103.0,-94.0,-125.0,-164.040167,68
2163,2021-12-01,XIUTAN,183652.118876,2.617397e+06,-182.0,-187.0,-182.0,-6.0,-359.210283,68
2164,2021-12-01,XIZHOU,199069.972355,2.638501e+06,-22.0,-37.0,-56.0,-21.0,-180.072782,68


In [3]:
# ==============================================================================
# 2. MAIN PROCESSING LOOP
# For each layer column (Layer_1 .. Layer_4), build one CSV that stacks, for
# every station passing the quality checks, the station's informational columns,
# the layer series re-based to its first reading, and CUMDISP (left unchanged).
# ==============================================================================

# A station needs at least this many rows to be kept.
MIN_STATION_RECORDS = 67
# Positional row window kept per station: rows 1..66, i.e. the 2nd to the 67th.
# Row 0 is dropped because the layer value there is 0 after re-basing. The stop
# bound equals MIN_STATION_RECORDS so that every kept station contributes the
# same number of rows.
WINDOW_START = 1
WINDOW_STOP = MIN_STATION_RECORDS

# Every measurement column in the dataset; all remaining columns are treated as
# informational (station name, coordinates, time, month index).
measurement_cols = [col for col in df.columns if "Layer" in col] + ["CUMDISP"]

# --- Outer Loop: Iterate through each data layer (Layer 1 to 4) ---
for lay_num in trange(1, 5, desc="Processing Layers"):

    # Column name of the layer handled in this pass. It is defined here, not in
    # the station loop, because the output filename below needs it even when no
    # station is processed.
    current_layer = f"Layer_{lay_num}"
    print(f"Processing Layer {lay_num}")

    # The two measurement columns written to this layer's output.
    select_measure_cols = [current_layer, "CUMDISP"]

    # Per-station frames are collected here and concatenated once after the
    # loop, which avoids re-copying the growing result on every iteration.
    station_frames = []

    # --- Inner Loop: Iterate through each unique station ---
    for select_station in unique_stations:

        # Rows belonging to the current station only.
        df_byStation = df.query("STATION==@select_station")
        layer_values = df_byStation[current_layer]

        # --- Data Quality Checks ---
        # Condition 1: the station has too few data points.
        cond1 = len(df_byStation) < MIN_STATION_RECORDS
        # Condition 2: every measurement of the current layer is missing (NaN).
        cond2 = layer_values.isna().all()
        # Condition 3: every non-missing measurement of the current layer is
        # exactly zero. (A zero mean alone would also match series whose
        # positive and negative values cancel out.)
        cond3 = layer_values.dropna().eq(0).all()

        # If any of the above conditions is true, skip this station.
        if cond1 or cond2 or cond3:
            print(f"Skip {select_station}")
            continue

        # --- Data Normalization ---
        # Informational columns. Index.difference returns them sorted by name,
        # which fixes the column order of the output file.
        info_df_byStation = df_byStation.loc[
            :, df_byStation.columns.difference(measurement_cols)
        ]
        # Explicit copy so the assignment below never writes into, or warns
        # about, a view of `df`.
        measurement_df_byStation = df_byStation.loc[
            :, select_measure_cols
        ].copy()

        # Re-base the layer series on the station's first row so that it starts
        # at 0 and represents change over time. CUMDISP is not modified.
        measurement_df_byStation[current_layer] = (
            measurement_df_byStation[current_layer]
            - measurement_df_byStation[current_layer].iloc[0]
        )

        # --- Recombine and Finalize Data for the Station ---
        temp = pd.concat([info_df_byStation, measurement_df_byStation], axis=1)

        # Keep the fixed positional window (2nd to 67th row).
        temp = temp.iloc[WINDOW_START:WINDOW_STOP, :]
        station_frames.append(temp)

    # Stack all stations. With no surviving station an empty frame is written,
    # because pd.concat raises on an empty list.
    if station_frames:
        final_mlcw_input = pd.concat(station_frames, axis=0, ignore_index=True)
    else:
        final_mlcw_input = pd.DataFrame()

    # --- Save the Processed Data for the Layer ---
    today_string = datetime.today().strftime("%Y%m%d")
    output_filename = (
        f"{today_string}_GTWR_InputData_MLCW_InSAR_{current_layer}.csv"
    )
    print(output_filename)
    final_mlcw_input.to_csv(output_filename, index=False)
    print(f"Saved data for {current_layer} to {output_filename}\n")

Processing Layers:   0%|          | 0/4 [00:00<?, ?it/s]

Processing Layer 1
Skip ANHE
Skip TANQIFENXIAO
Skip XINPI
20250724_GTWR_InputData_MLCW_InSAR_Layer_1.csv
Saved data for Layer_1 to 20250724_GTWR_InputData_MLCW_InSAR_Layer_1.csv

Processing Layer 2
Skip ANHE
Skip TANQIFENXIAO
Skip XINPI
20250724_GTWR_InputData_MLCW_InSAR_Layer_2.csv
Saved data for Layer_2 to 20250724_GTWR_InputData_MLCW_InSAR_Layer_2.csv

Processing Layer 3
Skip ANHE
Skip JIANYANG
Skip TANQIFENXIAO
Skip XINPI
20250724_GTWR_InputData_MLCW_InSAR_Layer_3.csv
Saved data for Layer_3 to 20250724_GTWR_InputData_MLCW_InSAR_Layer_3.csv

Processing Layer 4
Skip ANHE
Skip HAIFENG
Skip JIANYANG
Skip JIAXING
Skip TANQIFENXIAO
Skip XINPI
Skip XINXING
20250724_GTWR_InputData_MLCW_InSAR_Layer_4.csv
Saved data for Layer_4 to 20250724_GTWR_InputData_MLCW_InSAR_Layer_4.csv



#### Produce `All_Layer` column

In [4]:
# Row-wise total of the four layer columns. DataFrame.sum skips NaN by default,
# so a missing layer value contributes 0 to the total.
layer_columns = [f"Layer_{idx}" for idx in range(1, 5)]
all_layer_arr = df[layer_columns].sum(axis=1)

# Work on a copy so `df` itself is left unchanged, and place the new column two
# positions before the end of the frame (directly ahead of CUMDISP and monthly).
sub = df.copy()
sub.insert(loc=sub.shape[1] - 2, column="All_Layer", value=all_layer_arr)
sub.head(5)

Unnamed: 0,time,STATION,X_TWD97,Y_TWD97,Layer_1,Layer_2,Layer_3,Layer_4,All_Layer,CUMDISP,monthly
0,2016-04-01,ANHE,179539.204623,2602035.0,-33.0,-88.0,-13.0,-2.0,-136.0,0.0,0
1,2016-04-01,BEICHEN,178859.958807,2608229.0,-8.0,-32.0,-20.0,-8.0,-68.0,0.0,0
2,2016-04-01,CANLIN,173088.151033,2608157.0,-3.0,-45.0,-56.0,-16.0,-120.0,0.0,0
3,2016-04-01,DONGGUANG,175783.144962,2616755.0,-23.0,-77.0,-93.0,-3.0,-196.0,0.0,0
4,2016-04-01,ERLUN,190429.148778,2629865.0,-19.0,-23.0,-22.0,-4.0,-68.0,0.0,0


In [6]:
# Build one CSV for the summed "All_Layer" column: for every station passing
# the quality checks, stack the informational columns together with All_Layer
# and CUMDISP, both re-based to the station's first reading.

# A station needs at least this many rows to be kept.
MIN_STATION_RECORDS = 67
# Positional row window kept per station: rows 1..66, i.e. the 2nd to the 67th.
# Row 0 is dropped because it is all zeros after re-basing.
WINDOW_START = 1
WINDOW_STOP = MIN_STATION_RECORDS

# Column processed by this cell. Defined before the loop because the output
# filename below needs it even when no station is processed.
current_layer = "All_Layer"

# Every measurement column of `sub` (Layer_1..Layer_4, All_Layer, CUMDISP); all
# remaining columns are treated as informational.
measurement_cols = [col for col in sub.columns if "Layer" in col] + ["CUMDISP"]
# The two measurement columns written to the output.
select_measure_cols = [current_layer, "CUMDISP"]

# Per-station frames are collected here and concatenated once after the loop,
# which avoids re-copying the growing result on every iteration.
station_frames = []

# --- Loop: Iterate through each unique station ---
for select_station in unique_stations:

    # Rows belonging to the current station only.
    df_byStation = sub.query("STATION==@select_station")
    layer_values = df_byStation[current_layer]

    # --- Data Quality Checks ---
    # Condition 1: the station has too few data points.
    cond1 = len(df_byStation) < MIN_STATION_RECORDS
    # Condition 2: every All_Layer value is missing (NaN).
    cond2 = layer_values.isna().all()
    # Condition 3: every non-missing All_Layer value is exactly zero.
    # (A zero mean alone would also match series whose values cancel out.)
    cond3 = layer_values.dropna().eq(0).all()

    # If any of the above conditions is true, skip this station.
    if cond1 or cond2 or cond3:
        print(f"Skip {select_station}")
        continue

    # --- Data Normalization ---
    # Informational columns. Index.difference returns them sorted by name,
    # which fixes the column order of the output file.
    info_df_byStation = df_byStation.loc[
        :, df_byStation.columns.difference(measurement_cols)
    ]
    measurement_df_byStation = df_byStation.loc[:, select_measure_cols]

    # Subtract the station's first row from every row, column by column, so
    # both series start at 0 and represent change over time.
    measurement_df_byStation = measurement_df_byStation.subtract(
        measurement_df_byStation.iloc[0, :], axis=1
    )

    # --- Recombine and Finalize Data for the Station ---
    temp = pd.concat([info_df_byStation, measurement_df_byStation], axis=1)

    # Keep the fixed positional window (2nd to 67th row).
    temp = temp.iloc[WINDOW_START:WINDOW_STOP, :]
    station_frames.append(temp)

# Stack all stations. With no surviving station an empty frame is written,
# because pd.concat raises on an empty list.
if station_frames:
    final_mlcw_input = pd.concat(station_frames, axis=0, ignore_index=True)
else:
    final_mlcw_input = pd.DataFrame()

# --- Save the Processed Data ---
today_string = datetime.today().strftime("%Y%m%d")
output_filename = (
    f"{today_string}_GTWR_InputData_MLCW_InSAR_{current_layer}.csv"
)
print(output_filename)
final_mlcw_input.to_csv(output_filename, index=False)
print(f"Saved data for {current_layer} to {output_filename}\n")

Skip ANHE
Skip TANQIFENXIAO
Skip XINPI
20250724_GTWR_InputData_MLCW_InSAR_All_Layer.csv
Saved data for All_Layer to 20250724_GTWR_InputData_MLCW_InSAR_All_Layer.csv

