# In [1]:
"""
This script processes CSV files to calculate the distance and squared difference
between points based on their coordinates and a selected value column.
The results are saved to an Excel file.
"""

# -----------------------------------------
# Import necessary modules
# -----------------------------------------

from itertools import combinations
import pandas as pd
import numpy as np
import os
from glob import glob
from appgeopy import *
from my_packages import *

# -----------------------------------------
# Function definition
# -----------------------------------------

def calculate_distance_and_squared_difference(
    df, x_col, y_col, value_col, station_col="STATION"
):
    """
    Calculate the distance and squared difference between each pair of points in the DataFrame.

    Every unordered pair of rows is emitted exactly once, in row-major order
    (row 0 with rows 1..n-1, then row 1 with rows 2..n-1, and so on), which is
    the order ``itertools.combinations`` would produce over the rows.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the points.
    x_col (str): The column name for the X coordinates.
    y_col (str): The column name for the Y coordinates.
    value_col (str): The column name for the values to calculate squared differences.
    station_col (str): The column name holding the point identifiers.
        Defaults to "STATION".

    Returns:
    pd.DataFrame: One row per pair with the columns "Station1", "Station2",
        "Value1", "Value2", "Distance" (Euclidean distance in the units of the
        coordinate columns) and "SquaredDifference" ((Value1 - Value2) ** 2,
        NaN when either value is NaN). Empty when ``df`` has fewer than two rows.

    Raises:
    KeyError: If any of the named columns is missing from ``df``.
    """
    # Extract each column once as an array; pairs are formed by position, so
    # the DataFrame index (labels, duplicates, ordering) does not matter.
    stations = df[station_col].to_numpy()
    x_coords = df[x_col].to_numpy(dtype=float)
    y_coords = df[y_col].to_numpy(dtype=float)
    values = df[value_col].to_numpy()

    # Positions (first < second) of every unordered pair of rows; the strict
    # upper triangle (k=1) excludes pairing a row with itself.
    first, second = np.triu_indices(len(df), k=1)

    value1 = values[first]
    value2 = values[second]

    return pd.DataFrame(
        {
            "Station1": stations[first],
            "Station2": stations[second],
            "Value1": value1,
            "Value2": value2,
            "Distance": np.hypot(
                x_coords[second] - x_coords[first],
                y_coords[second] - y_coords[first],
            ),
            # NaN in either operand propagates through the arithmetic, so no
            # explicit missing-value branch is needed.
            "SquaredDifference": (value1 - value2) ** 2,
        }
    )

# -----------------------------------------
# Define folder paths and create save folder
# -----------------------------------------

# Root folder holding the calibrated "TRANSFORMED*.csv" inputs; results go to
# a sub-folder of it, created on first run.
topfolder = r"E:\030_CHOUSHUI_2024\000_INSCALDEFO_2_INSSTACKPSI\PROCESS_005\15_CALIBRATION"
savefolder = os.path.join(topfolder, "0__Supplementary")
os.makedirs(savefolder, exist_ok=True)

# -----------------------------------------
# Identify files to process
# -----------------------------------------

# glob returns matches in arbitrary order; sort them so files are processed
# in the same order on every run.
file2process = sorted(glob(os.path.join(topfolder, "TRANSFORMED*.csv")))

# -----------------------------------------
# Process each file
# -----------------------------------------

# Columns every input file must provide besides the "Trans*" value columns.
key_columns = ["STATION", "X_TWD97", "Y_TWD97"]

for select_file in file2process:
    # Derive the output name from the file stem so that only the leading
    # "TRANSFORMED" prefix and the extension change; replacing the substring
    # "csv" anywhere in the name would also rewrite it inside the stem and
    # would miss an upper-case ".CSV" extension.
    stem, _ = os.path.splitext(os.path.basename(select_file))
    savename = stem.replace("TRANSFORMED", "SUPPORT", 1) + ".xlsx"
    savepath = os.path.join(savefolder, savename)

    # Start from a clean workbook so sheets from a previous run do not linger
    if os.path.isfile(savepath):
        os.remove(savepath)

    # Read the selected CSV file
    output_records = pd.read_csv(select_file)

    # Identify value columns starting with "Trans"
    value_columns = [col for col in output_records.columns if col.startswith("Trans")]

    # Process each value column
    for select_col in tqdm(value_columns):
        # Select the identifier/coordinate columns by name rather than by
        # position, since they are read by name below.
        data = output_records.loc[:, key_columns + [select_col]].copy()

        # Calculate distances and differences
        analysis_df = calculate_distance_and_squared_difference(
            df=data, x_col="X_TWD97", y_col="Y_TWD97", value_col=select_col
        )
        # Pairs with a missing value carry no information for the thresholds
        analysis_df = analysis_df[analysis_df["SquaredDifference"].notna()]

        # Identify pairs of points with low distance but high squared difference:
        # the closest 10 % of pairs that are also among the 10 % most dissimilar.
        distance_p10 = analysis_df["Distance"].quantile(q=0.1)
        sq_diff_q90 = analysis_df["SquaredDifference"].quantile(q=0.90)

        is_close = analysis_df["Distance"] <= distance_p10
        is_dissimilar = analysis_df["SquaredDifference"] >= sq_diff_q90
        temp = analysis_df[is_close & is_dissimilar]

        # Save the results to an Excel file, one sheet per value column.
        # NOTE(review): presumably save_df_to_excel appends a sheet to an
        # existing workbook - confirm against the data_io helper. Excel limits
        # sheet names to 31 characters; verify column names stay within that.
        data_io.save_df_to_excel(df_to_save=temp, filepath=savepath, sheet_name=select_col, verbose=False)


# 100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [00:08<00:00,  6.27it/s]
# 100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [00:07<00:00,  7.03it/s]
# 100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [00:08<00:00,  6.14it/s]
