In [1]:
from appgeopy import *
from my_packages import *

In [2]:
# Load the pickled table stored at `fpath` (a path relative to the working
# directory) and inspect which flag values its "Outliers" column contains.
# NOTE(review): `pd` is not imported explicitly before this cell; presumably it
# is provided by one of the star imports in the first cell — confirm.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
fpath = r"CRFP_Ebill_2014_2021_agriculture.xz"
df = pd.read_pickle(fpath)
# Last expression of the cell, so the distinct "Outliers" values are displayed.
df["Outliers"].unique()

array([0, 1], dtype=int16)

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from scipy import stats
from pykrige.ok import OrdinaryKriging
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
import warnings
import time

# --- Configuration ---
# Everything a reader might want to tune lives here.

# Size of the simulated problem.
N_KNOWN_POINTS = 10
N_UNKNOWN_POINTS = 100
N_MONTHS = 36

# Preprocessing switches.
Z_SCORE_THRESHOLD = 3.0  # a measurement whose outlier score exceeds this is masked
APPLY_LOG_TRANSFORM = False  # True: krige log(values) and back-transform with exp

# Variogram selection by cross-validation.
CV_FOLDS = 5  # upper bound on the number of K-fold splits
VARIOGRAM_MODELS_TO_TEST = [
    'linear',
    'power',
    'gaussian',
    'spherical',
    'exponential',
]

# Output locations (the metrics go to Excel so more sheets can be added later).
OUTPUT_INTERPOLATED_FILE = 'interpolated_groundwater_levels.csv'
OUTPUT_EVALUATION_FILE = 'kriging_evaluation_metrics.xlsx'

# Optional noise reduction: silence RuntimeWarning and UserWarning globally.
# Note that this hides these categories for every library, not only PyKrige.
for _noisy_category in (RuntimeWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)

# --- 1. Simulate Data (Replace this with loading your actual data) ---
# Builds three frames:
#   known_coords   - X/Y of the measured points, uniform in a 100 x 100 square
#   known_data     - known_coords plus one 'Month_k' column per simulated month
#   unknown_coords - X/Y of the points to interpolate
# The order of the random draws below is part of the result: with the fixed
# seed, changing it changes the simulated data set.
print("Step 1: Simulating data...")
np.random.seed(42)  # reproducible draws

known_x = np.random.rand(N_KNOWN_POINTS) * 100
known_y = np.random.rand(N_KNOWN_POINTS) * 100
known_coords = pd.DataFrame({'X': known_x, 'Y': known_y})

# Per-point base level in [50, 60) plus a linear trend in X and Y.
base_levels = np.random.rand(N_KNOWN_POINTS) * 10 + 50
spatial_effect = known_coords['X'] * 0.05 + known_coords['Y'] * 0.03

known_values_list = []
for month in range(N_MONTHS):
    seasonal_offset = np.sin(month * 2 * np.pi / 12) * 2  # 12-month cycle, amplitude 2
    jitter = np.random.randn(N_KNOWN_POINTS) * 0.5  # measurement noise
    levels = base_levels + spatial_effect + seasonal_offset + jitter

    # Every tenth month (0, 10, 20, ...) one randomly chosen point is scaled by
    # 0.5 or 1.5 so that the outlier handling has something to find.
    if month % 10 == 0:
        target = np.random.choice(N_KNOWN_POINTS, 1, replace=False)
        scale = 1 + np.random.choice([-1, 1]) * 0.5
        levels[target] *= scale

    known_values_list.append(levels)

# Rows = points, columns = months.
month_labels = [f'Month_{k}' for k in range(1, N_MONTHS + 1)]
known_values = pd.DataFrame(np.array(known_values_list).T, columns=month_labels)

# Coordinates and measurements side by side.
known_data = pd.concat([known_coords, known_values], axis=1)

# Locations at which values will be interpolated.
unknown_x = np.random.rand(N_UNKNOWN_POINTS) * 100
unknown_y = np.random.rand(N_UNKNOWN_POINTS) * 100
unknown_coords = pd.DataFrame({'X': unknown_x, 'Y': unknown_y})

print(f"Simulated {N_KNOWN_POINTS} known points with {N_MONTHS} measurements each.")
print(f"Simulated {N_UNKNOWN_POINTS} unknown points.")

# --- 2. Preprocessing and Kriging Loop ---
# For every 'Month_k' column of `known_data` this section
#   1. masks outliers among the known measurements,
#   2. optionally log-transforms the remaining values,
#   3. picks the variogram model with the lowest K-fold cross-validation RMSE,
#   4. kriges the values at the unknown points with that model, and
#   5. records the CV RMSE / MAE on the original (untransformed) scale.
# Results are collected in `all_interpolated_values` (one column per month) and
# `all_evaluation_metrics` (one dict per month).


def _modified_z_scores(values):
    """Return the absolute modified z-score of each entry of `values`.

    The score is ``0.6745 * |x - median| / MAD`` (Iglewicz & Hoaglin), computed
    with NaNs ignored. It replaces the mean/standard-deviation z-score because
    that score is bounded: with the population standard deviation no |z| can
    exceed sqrt(n - 1), which is exactly 3.0 for 10 points, so a strict
    ``> 3.0`` test could never flag anything.

    Parameters
    ----------
    values : array-like of float
        Measurements for one month; may contain NaN.

    Returns
    -------
    numpy.ndarray
        Scores with the same shape as `values`. NaN inputs give NaN scores.
        If the MAD is zero or undefined (more than half of the values are
        identical, or there is no valid value) every score is 0, i.e. nothing
        is flagged.
    """
    values = np.asarray(values, dtype=float)
    center = np.nanmedian(values)
    mad = np.nanmedian(np.abs(values - center))
    if not np.isfinite(mad) or mad == 0:
        return np.zeros(values.shape, dtype=float)
    return np.abs(0.6745 * (values - center) / mad)


def _metrics_row(month, model, n_valid, transformation, n_outliers,
                 rmse=np.nan, mae=np.nan, error=None):
    """Build one evaluation record; the 'Error' key is only added when given."""
    row = {
        'Month': month, 'Best_Model': model, 'CV_RMSE': rmse,
        'CV_MAE': mae, 'Num_Valid_Points': n_valid,
        'Transformation': transformation, 'Outliers_Removed': n_outliers,
    }
    if error is not None:
        row['Error'] = error
    return row


print("\nStep 2: Starting Preprocessing and Kriging for each month...")
all_interpolated_values = pd.DataFrame(index=unknown_coords.index)
all_evaluation_metrics = []

x_known = known_data['X'].values
y_known = known_data['Y'].values
x_unknown = unknown_coords['X'].values
y_unknown = unknown_coords['Y'].values

start_time = time.time()

for i in range(N_MONTHS):
    month_col = f'Month_{i+1}'
    print(f"\nProcessing {month_col}...")

    # Work on a copy so the measurements in `known_data` stay untouched.
    z_known_original = known_data[month_col].values.copy()

    # --- Preprocessing ---
    # a) Outlier removal using the robust (median/MAD) modified z-score.
    z_scores = _modified_z_scores(z_known_original)
    outlier_indices = np.where(z_scores > Z_SCORE_THRESHOLD)[0]
    z_known_processed = z_known_original.copy()
    if len(outlier_indices) > 0:
        print(f"  Detected {len(outlier_indices)} outliers. Replacing with NaN.")
        z_known_processed[outlier_indices] = np.nan
    else:
        print("  No outliers detected.")

    # Drop NaNs (masked outliers or values that were missing already).
    valid_indices = ~np.isnan(z_known_processed)
    if np.sum(valid_indices) < 3:  # Need at least 3 points for variogram
        print(f"  Skipping {month_col}: Not enough valid data points ({np.sum(valid_indices)}) after outlier removal.")
        all_interpolated_values[month_col] = np.nan
        all_evaluation_metrics.append(_metrics_row(
            month_col, 'Skipped', np.sum(valid_indices), 'None', len(outlier_indices)
        ))
        continue

    x_known_valid = x_known[valid_indices]
    y_known_valid = y_known[valid_indices]
    z_known_valid = z_known_processed[valid_indices]

    # b) Optional log transform; only possible for strictly positive values.
    transformation_applied = "None"
    if APPLY_LOG_TRANSFORM:
        if np.any(z_known_valid <= 0):
            print("  Warning: Data contains non-positive values. Cannot apply log transform. Skipping transformation.")
        else:
            print("  Applying Log Transformation.")
            z_known_valid = np.log(z_known_valid)
            transformation_applied = "Log"

    # --- Variogram Model Selection via Cross-Validation ---
    print("  Performing Cross-Validation to select best variogram model...")
    best_model = None
    best_rmse = np.inf
    cv_predictions = None  # out-of-fold predictions of the best model so far

    # Never ask for more folds than there are samples; seeding with the month
    # index keeps the splits reproducible.
    kf = KFold(n_splits=min(CV_FOLDS, len(x_known_valid)), shuffle=True, random_state=i)

    for model in VARIOGRAM_MODELS_TO_TEST:
        print(f"    Testing model: {model}")
        # A fresh array per model, so the stored best predictions are never
        # overwritten by a later candidate.
        model_cv_predictions = np.full_like(z_known_valid, fill_value=np.nan, dtype=float)

        try:
            for train_idx, val_idx in kf.split(x_known_valid):
                # Nothing to predict in this fold.
                if len(val_idx) == 0:
                    continue

                # Too few training points to estimate a variogram.
                if len(train_idx) < 3:
                    continue

                ok_cv = OrdinaryKriging(
                    x_known_valid[train_idx],
                    y_known_valid[train_idx],
                    z_known_valid[train_idx],
                    variogram_model=model,
                    verbose=False,
                    enable_plotting=False,
                    # nlags=6 # Can adjust nlags if needed
                )

                # Predict at the held-out points (second return value unused;
                # presumably the kriging variance).
                pred_z, pred_ss = ok_cv.execute(
                    'points',
                    x_known_valid[val_idx],
                    y_known_valid[val_idx]
                )
                model_cv_predictions[val_idx] = pred_z

            # RMSE over the points that actually received a prediction, on the
            # (possibly log-transformed) scale used for kriging.
            valid_cv_preds = ~np.isnan(model_cv_predictions)
            if np.sum(valid_cv_preds) > 0:
                current_rmse = np.sqrt(mean_squared_error(z_known_valid[valid_cv_preds], model_cv_predictions[valid_cv_preds]))
                print(f"      CV RMSE: {current_rmse:.4f}")
                if current_rmse < best_rmse:
                    best_rmse = current_rmse
                    best_model = model
                    cv_predictions = model_cv_predictions
            else:
                print(f"      CV failed for model {model} (no valid predictions).")

        except Exception as e:
            # Best effort: a model that cannot be fitted is reported and skipped.
            print(f"      Error during Kriging/CV for model {model}: {e}")
            continue  # Try next model

    if best_model is None:
        print(f"  Skipping {month_col}: Could not find a suitable variogram model via cross-validation.")
        all_interpolated_values[month_col] = np.nan
        all_evaluation_metrics.append(_metrics_row(
            month_col, 'Failed', len(z_known_valid), transformation_applied, len(outlier_indices)
        ))
        continue

    print(f"  Best variogram model selected: {best_model} (CV RMSE: {best_rmse:.4f})")

    # --- Kriging Interpolation using the Best Model ---
    print(f"  Performing Kriging interpolation using {best_model} model...")
    try:
        ok_final = OrdinaryKriging(
            x_known_valid,
            y_known_valid,
            z_known_valid,
            variogram_model=best_model,
            verbose=False,
            enable_plotting=False,
            # nlags=6
        )

        # Execute on unknown points
        z_pred, z_var = ok_final.execute('points', x_unknown, y_unknown)

        # --- Back Transformation (if applied) ---
        if transformation_applied == "Log":
            print("  Applying inverse transformation (Exp).")
            # Only the predictions are back-transformed; `z_var` stays on the
            # log scale because its back-transformation is more involved.
            z_pred = np.exp(z_pred)

        # Store interpolated values
        all_interpolated_values[month_col] = z_pred
        print(f"  Interpolation for {month_col} complete.")

        # --- Evaluation ---
        # A model is only selected when it produced at least one out-of-fold
        # prediction, so `cv_predictions` has valid entries here.
        valid_cv_indices = ~np.isnan(cv_predictions)
        cv_preds_eval_scale = cv_predictions[valid_cv_indices]
        # Reference values: outlier-masked data on the original scale.
        z_known_eval_scale = z_known_processed[valid_indices][valid_cv_indices]

        if transformation_applied == "Log":
            # Bring the predictions back to the original scale before scoring.
            cv_preds_eval_scale = np.exp(cv_preds_eval_scale)

        cv_mae = mean_absolute_error(z_known_eval_scale, cv_preds_eval_scale)
        # `best_rmse` may be on the log scale; report RMSE on the original one.
        cv_rmse_original_scale = np.sqrt(mean_squared_error(z_known_eval_scale, cv_preds_eval_scale))

        print(f"  CV Metrics (Original Scale): RMSE={cv_rmse_original_scale:.4f}, MAE={cv_mae:.4f}")

        all_evaluation_metrics.append(_metrics_row(
            month_col, best_model, len(z_known_valid), transformation_applied,
            len(outlier_indices), rmse=cv_rmse_original_scale, mae=cv_mae
        ))

    except Exception as e:
        # Best effort: keep processing the remaining months, but leave a trace
        # of the failure in both outputs.
        print(f"  Error during final Kriging interpolation for {month_col}: {e}")
        all_interpolated_values[month_col] = np.nan
        all_evaluation_metrics.append(_metrics_row(
            month_col, best_model if best_model else 'Error', len(z_known_valid),
            transformation_applied, len(outlier_indices), error=str(e)
        ))

end_time = time.time()
print(f"\nProcessing finished in {end_time - start_time:.2f} seconds.")

# --- 3. Save Results ---
# Writes two files: the interpolated values (CSV) and the per-month evaluation
# metrics (Excel). A failure to write one file is reported and does not stop
# the other from being written.
print("\nStep 3: Saving results...")

# One row per unknown point: its coordinates followed by the monthly values.
final_interpolated_data = pd.concat([unknown_coords, all_interpolated_values], axis=1)

# Interpolated values -> CSV
try:
    final_interpolated_data.to_csv(OUTPUT_INTERPOLATED_FILE, index=False)
except Exception as exc:
    print(f"Error saving interpolated values: {exc}")
else:
    print(f"Interpolated values saved to '{OUTPUT_INTERPOLATED_FILE}'")

# Evaluation metrics -> Excel, single sheet
try:
    metrics_df = pd.DataFrame(all_evaluation_metrics)
    metrics_df.to_excel(OUTPUT_EVALUATION_FILE, index=False, sheet_name='Monthly_Metrics')
except Exception as exc:
    print(f"Error saving evaluation metrics: {exc}")
else:
    print(f"Evaluation metrics saved to '{OUTPUT_EVALUATION_FILE}'")

print("\nProgram finished.")