In [1]:
import os
import pickle

from appgeopy import *
from my_packages import *

In [2]:
def apply_nscore(df, idx_column="STATION", transformation_columns=None):
    """
    Apply a normal score transformation to selected columns of a DataFrame.

    Each selected column is transformed independently with ``geostats.nscore``
    after its missing values are dropped. Transformed values outside the
    closed interval [-3, 3] are discarded. Results are written back to the
    row they were computed from (alignment is by row position), so repeated
    identifier values do not break the mapping.

    Parameters:
      df (pd.DataFrame): Input data frame. It is not modified.
      idx_column (str): Name of the identifier column (default: "STATION").
      transformation_columns (list of str): Column names to transform.
          If None, every column except idx_column is transformed.

    Returns:
      pd.DataFrame: A new DataFrame with one row per input row, in input order,
          and a fresh 0..n-1 index. Columns are: idx_column first, then every
          input column that is not listed in transformation_columns (unchanged),
          then one 'Trans_<name>' column per successfully transformed column.
          A 'Trans_<name>' value is NaN where the source value was missing or
          the transformed value fell outside [-3, 3].

    Raises:
      ValueError: If idx_column is not a column of df.

    Notes:
      - The original (untransformed) versions of the transformed columns are
        not included in the result.
      - A column whose transformation raises is reported with a printed
        message and left out of the result; the remaining columns are still
        processed.
    """
    # Validate that the required identifier column exists
    if idx_column not in df.columns:
        raise ValueError(
            f"Input DataFrame must contain the '{idx_column}' column as a unique identifier."
        )

    # Determine columns to transform if not explicitly provided
    if transformation_columns is None:
        transformation_columns = [
            col for col in df.columns if col != idx_column
        ]

    # Start from the columns that are carried over untouched. The identifier
    # becomes the index here so that the final reset_index() puts it first.
    output_df = df.loc[:, ~df.columns.isin(transformation_columns)].copy()
    output_df = output_df.set_index(idx_column)

    for col in tqdm(transformation_columns):
        try:
            # Label rows 0..n-1 so every result can be traced back to the exact
            # row it came from, independent of the identifier values.
            source = df[[idx_column, col]].reset_index(drop=True)
            temp = source.dropna(subset=[col]).copy()

            # The new transformed column is prefixed with 'Trans_'.
            # Only the first of nscore's three return values is used.
            transformed_col = "Trans_" + col
            temp[transformed_col], _, _ = geostats.nscore(temp, col)

            # Keep only scores inside the acceptable range [-3, 3] (inclusive)
            in_range = temp[transformed_col].between(-3, 3)

            # Expand back to one value per input row; rows that were missing or
            # filtered out become NaN. Assigning the raw array is positional,
            # which is safe because output_df has the same row order as df.
            aligned = temp.loc[in_range, transformed_col].reindex(source.index)
            output_df[transformed_col] = aligned.to_numpy()
        except Exception as e:
            # Best effort: report the failing column and carry on with the rest.
            print(f"Error processing column '{col}': {e}")

    # Reset index so that the identifier becomes a column again
    return output_df.reset_index()

#### Monthly dU

In [5]:
# Input: pickled monthly displacement table (pandas infers the compression
# from the file extension when reading).
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
fpath = "CORRECTED_Monthly_DISPLACEMENT_CRFP_saveqgis_Oct2025.xz"

# File name up to its first dot; later cells reuse it to name the outputs.
basename = os.path.basename(fpath).partition(".")[0]

# Move the stored index into a regular column so it can serve as identifier.
# NOTE(review): presumably this is where "PointKey" comes from — confirm.
df = pd.read_pickle(fpath).reset_index(drop=False)

# Every column whose name starts with "N" is normal-score transformed.
trans_cols = [name for name in df.columns if name.startswith("N")]

transformed_df = apply_nscore(
    df=df,
    idx_column="PointKey",
    transformation_columns=trans_cols,
)

# Single-file exports, currently disabled:
# transformed_df.to_csv(f"2_Transformed/NSCORE_{basename.replace(".xz", ".csv")}", index=False)
#
# with open(f"2_Transformed/NSCORE_{basename}", "wb") as f:
#     pickle.dump(transformed_df.to_dict(), f, protocol=2)

  0%|          | 0/112 [00:00<?, ?it/s]

In [6]:
number_of_chunks = 30
output_dir = "2_Transformed"

# Make sure the target folder exists so the first open() below cannot fail
# on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Split the transformed DataFrame into `number_of_chunks` consecutive,
# near-equal row blocks. The row positions are split instead of the frame
# itself: np.array_split on a DataFrame goes through DataFrame.swapaxes,
# which pandas deprecated in 2.1. Block sizes are the same either way.
row_groups = np.array_split(np.arange(len(transformed_df)), number_of_chunks)
chunks = [transformed_df.iloc[rows] for rows in row_groups]

# Write each block as a pickled dict, numbered from 001.
# NOTE(review): the files carry an ".xz" suffix but are plain, uncompressed
# pickles; a reader that infers compression from the extension will not be
# able to open them — confirm this is what the consumer expects.
# NOTE(review): protocol=2 is presumably required by an older Python reader —
# confirm before changing it.
for i, chunk in enumerate(chunks):
    out_path = f"{output_dir}/NSCORE_{basename}_{i + 1:03d}.xz"
    with open(out_path, "wb") as f:
        pickle.dump(chunk.to_dict(), f, protocol=2)