In [None]:
import pickle

from appgeopy import *
from my_packages import *

In [None]:
def apply_nscore(
    df, idx_column="STATION", transformation_columns=None
):
    """
    Normal-score transform selected columns of a DataFrame, keyed by an identifier.

    Each requested column is handled independently: rows where the column is
    missing are dropped, the remaining values are passed to
    ``geostats.nscore``, scores outside [-3, 3] are discarded, and the kept
    scores are written to a new column named ``'Trans_' + <column>``, matched
    to rows through ``idx_column``. Rows without a kept score have no mapped
    value in that column.

    Parameters:
      df (pd.DataFrame): Input data frame. It is not modified.
      idx_column (str): Name of the unique identifier column (default: "STATION").
      transformation_columns (list of str): Names of the columns to transform.
          If None, every column except idx_column is transformed.

    Returns:
      pd.DataFrame: The identifier column, every column that was not selected
          for transformation (unchanged), and one 'Trans_'-prefixed column per
          column that was processed without error.

    Raises:
      ValueError: If idx_column is not a column of df.

    Notes:
      A column whose processing raises an exception is reported with
      ``print`` and skipped, so no 'Trans_' column is added for it.
    """
    # The identifier column is mandatory: results are mapped back through it.
    if idx_column not in df.columns:
        raise ValueError(
            f"Input DataFrame must contain the '{idx_column}' column as a unique identifier."
        )

    # Default selection: everything except the identifier.
    if transformation_columns is None:
        transformation_columns = [
            name for name in df.columns if name != idx_column
        ]

    # Start from the columns that are NOT transformed, indexed by the identifier
    # so transformed values can be looked up per record.
    is_untransformed = ~df.columns.isin(transformation_columns)
    output_df = df.loc[:, is_untransformed].copy().set_index(idx_column)

    for col in tqdm(transformation_columns):
        try:
            # Work only on the rows where this column has a value.
            valid_rows = df[[idx_column, col]].dropna(subset=[col]).copy()
            out_name = "Trans_" + col

            # nscore returns three values; only the first (the scores) is used.
            scores, _, _ = geostats.nscore(valid_rows, col)
            valid_rows[out_name] = scores

            # Keep only scores inside the closed interval [-3, 3].
            within_range = valid_rows[out_name].between(-3, 3)
            kept = valid_rows[within_range].set_index(idx_column)

            # Look up each output record's score by its identifier.
            output_df[out_name] = output_df.index.map(kept[out_name])
        except Exception as e:
            # Best effort: report the failing column and move on to the next.
            print(f"Error processing column '{col}': {e}")

    # Turn the identifier back into a regular column.
    return output_df.reset_index()

In [None]:
def apply_logtransform(
    df, idx_column="STATION", transformation_columns=None, c=1
):
    """
    Apply a log transform to selected DataFrame columns.

    For every selected column, ``np.log(value + c)`` is computed on the rows
    where the column is not missing, and the result is stored in a new column
    named ``'Trans_' + <column>``, matched to rows through ``idx_column``.
    Rows where the source value is missing have no mapped value in the new
    column. By default, all columns except the identifier are transformed.

    Parameters:
        df (pd.DataFrame): Input data. It is not modified.
        idx_column (str): Unique identifier column name (default "STATION").
        transformation_columns (list): Columns to transform. If None, all
                                       columns except idx_column are used.
        c (float): Constant added before taking the log, to avoid log(0)
                   (default 1).

    Returns:
        pd.DataFrame: The identifier column, every column that was not
            selected for transformation (unchanged), and one
            'Trans_'-prefixed column per column processed without error.

    Raises:
        ValueError: If the identifier column is missing.

    Notes:
        A column whose processing raises an exception is reported with
        ``print`` and skipped, so no 'Trans_' column is added for it.
    """
    # Check that the identifier column exists.
    if idx_column not in df.columns:
        raise ValueError(
            f"Input DataFrame must contain '{idx_column}'."
        )

    # If no columns specified, use all except the identifier.
    if transformation_columns is None:
        transformation_columns = [
            col for col in df.columns if col != idx_column
        ]

    # Start from the columns that are not transformed, indexed by the
    # identifier so transformed values can be looked up per record.
    output_df = df.loc[
        :, ~df.columns.isin(transformation_columns)
    ].copy()
    output_df.set_index(idx_column, inplace=True)

    # Process each column to be transformed.
    for col in transformation_columns:
        try:
            # Keep only the rows where this column has a value.
            temp = df[[idx_column, col]].dropna(subset=[col]).copy()
            transformed_col = "Trans_" + col
            # Compute the log on the filtered rows themselves. Using the
            # unfiltered df[col] here would force pandas to re-align it to
            # temp's index, which raises when df has duplicate index labels
            # and silently drops the whole column via the except below.
            temp[transformed_col] = np.log(temp[col] + c)
            temp.set_index(idx_column, inplace=True)
            # Map the transformed values back to the output DataFrame.
            output_df[transformed_col] = output_df.index.map(
                temp[transformed_col]
            )
        except Exception as e:
            # Best effort: report the failing column and continue.
            print(f"Error processing column '{col}': {e}")
            continue

    output_df.reset_index(inplace=True)
    return output_df

In [None]:
from sklearn.preprocessing import (
    QuantileTransformer,
)


def quantile_normal_transform(
    data, n_quantiles=1000, subsample=10_000, random_state=42
):
    """
    Map a one-dimensional sample onto a normal distribution via its quantiles.

    The input is converted to a NumPy array, reshaped into a single feature
    column, fitted and transformed with a scikit-learn
    ``QuantileTransformer`` configured with ``output_distribution="normal"``,
    and returned as a flat array.

    Parameters:
        data (array-like): Input values, treated as one feature.
        n_quantiles (int): Number of quantiles passed to the transformer
            (should be less than or equal to len(data)).
        subsample (int): Subsample size passed to the transformer.
        random_state (int): Seed passed to the transformer for reproducibility.

    Returns:
        numpy.ndarray: One-dimensional array of transformed values.
    """
    # QuantileTransformer works on 2-D input, so present the data as one column.
    column = np.array(data).reshape(-1, 1)

    transformer_options = {
        "n_quantiles": n_quantiles,
        "subsample": subsample,
        "output_distribution": "normal",
        "random_state": random_state,
    }
    transformer = QuantileTransformer(**transformer_options)

    # Fit on the data and transform it in one step, then drop the column axis.
    return transformer.fit_transform(column).flatten()

#### Monthly Groundwater

Transform the data using the normal-score (nscore) transformation from geostatspy, then save the transformed DataFrame to a CSV file, because Python 2.7 cannot read files saved in newer (higher-version) formats.

#### Monthly Rainfall

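Transform the data using a log transformation, because the data contain only positive values. Note: the code cell below currently applies `quantile_normal_transform` rather than `apply_logtransform`.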

In [None]:
# Input table for monthly rainfall; the output file reuses its file name.
fpath = "1_Input_DataSets/Monthly_Rainfall_CRFP.xz"
basename = os.path.basename(fpath)

# Load the table and exclude station C1K250.
df = pd.read_pickle(fpath).query("STATION!='C1K250'")

# Columns whose names start with "N" are transformed; all other columns are
# carried over unchanged.
trans_cols = [name for name in df.columns if name.startswith("N")]
output_df = df.loc[:, ~df.columns.isin(trans_cols)].copy()

# Each transformed column is stored as "Trans_<original name>". The number of
# quantiles is half the row count and the subsample size is the full row count.
for col in trans_cols:
    new_col = f"Trans_{col}"
    output_df[new_col] = quantile_normal_transform(
        df[col],
        n_quantiles=len(df) // 2,
        subsample=len(df),
        random_state=42,
    )

# Disabled alternative (CSV export):
# output_df.to_csv(f"2_Transformed/QuantileNormTrans_{basename.replace(".xz", ".csv")}", index=False)

# Save the table as a dict with pickle protocol 2 (presumably so Python 2.7
# can load it -- confirm with the downstream consumer).
# NOTE: the file keeps the input's ".xz" name but is written as a plain,
# uncompressed pickle.
with open(f"2_Transformed/QuantileNormTrans_{basename}", "wb") as f:
    pickle.dump(output_df.to_dict(), f, protocol=2)

#### Monthly Electricity

#### Monthly dU

In [None]:
# Input table for monthly dU displacement; `basename` is the file name up to
# its first ".".
fpath = "1_Input_DataSets/Monthly_DISPLACEMENT_dU_CRFP.xz"
basename = os.path.basename(fpath).split(".")[0]
df = pd.read_pickle(fpath)

# Normal-score transform every column whose name starts with "N", keyed by
# the "PointKey" identifier column.
trans_cols = [col for col in df.columns if col.startswith("N")]
transformed_df = apply_nscore(
    df=df, idx_column="PointKey", transformation_columns=trans_cols
)

# Disabled alternatives (single CSV / single pickle export):
# transformed_df.to_csv(f"2_Transformed/NSCORE_{basename.replace(".xz", ".csv")}", index=False)
# with open(f"2_Transformed/NSCORE_{basename}", "wb") as f:
#     pickle.dump(transformed_df.to_dict(), f, protocol=2)

number_of_chunks = 15

# Split the rows into `number_of_chunks` consecutive, nearly equal pieces.
# Row positions are split instead of the DataFrame itself: np.array_split on
# a DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
# The chunk boundaries are the same as np.array_split would produce.
row_positions = np.array_split(
    np.arange(len(transformed_df)), number_of_chunks
)
chunks = [transformed_df.iloc[positions] for positions in row_positions]

# Write each chunk as a dict with pickle protocol 2, numbered 001, 002, ...
# NOTE: the files carry an ".xz" suffix but are plain, uncompressed pickles.
for i, chunk in enumerate(chunks):
    out_path = f"2_Transformed/NSCORE_{basename}_{str(i+1).zfill(3)}.xz"
    with open(out_path, "wb") as f:
        pickle.dump(chunk.to_dict(), f, protocol=2)