In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import cross_val_predict, KFold

In [2]:
# Columns the models predict; the evaluation cell drops them from `data`
# to obtain the feature matrix.
targets = ["Turbidity", "DO", "Chl-a"]

# Read the raw table, then keep only its float64 / int64 columns.
df = pd.read_csv('../dataset/data.csv')
data = df.select_dtypes(include=['float64', 'int64'])

# Display the columns that survived the dtype filter.
data.columns

Index(['Turbidity', 'DO', 'Chl-a', 'Discharge', 'Height', 'Temperature', 'B1',
       'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'B11', 'B12',
       'WVP', 'MNDWI', 'GNDVI', 'SDDI', 'NDTI', 'BR', 'NDWI', 'NDPI', 'NDCI',
       '2BDA_Chl', 'RR'],
      dtype='object')

In [3]:
# Folder that holds the pickled models of each model family.
# Every family is listed exactly once: a key repeated inside a dict literal
# silently replaces the earlier entry, so a duplicate line has no effect.
# Files in a folder whose name contains "bo" are registered by the loader
# cell under a "bo"-prefixed key (e.g. "borfmlp_DO"), so they need no
# separate entry here.
model_dirs = {
    "rf": "../models/rf/cross",
    "mlp": "../models/mlp/cross",
    "rfmlp": "../models/rfmlp",
}

In [4]:
# Load every pickled model found in the configured folders into `models`,
# keyed as "<bo><family>_<target>" (e.g. "rf_DO", "borfmlp_DO").
#
# SECURITY: pickle.load can execute arbitrary code stored in the file.
# Only point `model_dirs` at folders whose contents you trust.
models = {}
for model_name, folder in model_dirs.items():
    # isdir (rather than exists) so a regular file sitting at this path is
    # reported here instead of making os.listdir raise.
    if not os.path.isdir(folder):
        print(f"Folder not found: {folder}")
        continue

    # os.listdir returns entries in arbitrary order; sorting makes the load
    # order, and therefore the key order of `models`, reproducible.
    for file in sorted(os.listdir(folder)):
        if not file.endswith(".pkl"):
            continue

        # NOTE(review): "bo" is detected and stripped as a plain substring, so
        # a target whose own name contains "bo" would be mangled. Confirm the
        # file naming scheme before adding such targets.
        bo = "bo" if "bo" in file else ""
        target = file.replace(".pkl", "").replace("bo", "").strip("_")
        path = os.path.join(folder, file)
        with open(path, 'rb') as f:
            models[f"{bo}{model_name}_{target}"] = pickle.load(f)

print(f"Loaded {len(models)} models.")
print("Available models:", list(models.keys()))

Loaded 12 models.
Available models: ['rf_Chl-a', 'rf_DO', 'rf_Turbidity', 'mlp_Chl-a', 'mlp_DO', 'mlp_Turbidity', 'borfmlp_Chl-a', 'borfmlp_DO', 'borfmlp_Turbidity', 'rfmlp_Chl-a', 'rfmlp_DO', 'rfmlp_Turbidity']


In [5]:
# Cross-validated evaluation of every loaded model.
# For each model, out-of-fold predictions are produced with a shuffled 5-fold
# split (the estimator is refit on each training split by cross_val_predict)
# and summarised as MAE, RMSE, R² and mean bias error (MBE).
results = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The feature matrix is identical for every model, so build it once instead
# of once per loop iteration.
X = data.drop(targets, axis=1)

for name, model in models.items():
    # The target is recovered from the model key by substring match; the first
    # entry of `targets` that occurs in the key wins.
    target = next((t for t in targets if t in name), None)
    if target is None:
        # Report the skip so a mis-named model file does not vanish silently.
        print(f"Skipping {name}: no known target in model name.")
        continue

    y = data[target]

    y_pred = cross_val_predict(model, X, y, cv=kf, n_jobs=-1)

    mae = np.around(mean_absolute_error(y, y_pred), 2)
    rmse = np.around(root_mean_squared_error(y, y_pred), 2)
    # R² is stored as a percentage (score * 100), not as a 0-1 fraction.
    r2 = np.around(r2_score(y, y_pred) * 100, 2)
    # Mean bias error: positive means the model over-predicts on average.
    mbe = np.around(np.mean(y_pred - y), 2)

    results.append({
        "Model": name,
        "Target": target,
        "MAE": mae,
        "RMSE": rmse,
        "R²": r2,
        "MBE": mbe
    })

# Passing the columns explicitly keeps the sort below valid when no model was
# evaluated; an empty list alone would produce a frame without "Target"/"R²"
# and sort_values would raise KeyError.
metrics_df = (
    pd.DataFrame(results, columns=["Model", "Target", "MAE", "RMSE", "R²", "MBE"])
    .sort_values(by=["Target", "R²"], ascending=[False, False])
    .reset_index(drop=True)
)
metrics_df

Unnamed: 0,Model,Target,MAE,RMSE,R²,MBE
0,borfmlp_Turbidity,Turbidity,5.68,7.7,95.21,-0.71
1,rfmlp_Turbidity,Turbidity,5.9,8.11,94.7,-0.87
2,mlp_Turbidity,Turbidity,6.61,9.25,93.09,0.29
3,rf_Turbidity,Turbidity,9.07,12.69,86.99,0.33
4,borfmlp_DO,DO,0.43,0.59,91.65,-0.01
5,rfmlp_DO,DO,0.45,0.61,90.94,0.0
6,rf_DO,DO,0.45,0.64,90.22,0.01
7,mlp_DO,DO,1.2,1.67,33.38,0.06
8,borfmlp_Chl-a,Chl-a,1.46,1.93,82.57,-0.17
9,rfmlp_Chl-a,Chl-a,1.46,1.94,82.56,-0.23


In [6]:
# Persist the metrics table. The output folder is created first because
# DataFrame.to_csv does not create missing parent directories and would
# otherwise fail on a fresh checkout.
metrics_path = "../dataset/predictions/metrics.csv"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
metrics_df.to_csv(metrics_path, index=False)