In [1]:
!pip3 install -q --upgrade pip
!pip3 install -q pandas numpy matplotlib seaborn openpyxl climateserv requests netCDF4 xarray pyproj statsmodels

In [23]:
import sys
from pathlib import Path

# Calculate the path to the scripts folder relative to the current notebook.
scripts_dir = Path("../../").resolve()

# Add the scripts directory to the sys.path if it's not already there.
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [32]:
# Assuming the CSV file is named "data.csv" and is located in the same directory as the notebook
df = pd.read_csv("../../data/processed_data/wells_data_gambia_for_machine_learning.csv")
# Mapping each unique string in 'DepthToGroundwater' to an integer
unique_values = df['DepthToGroundwater'].unique()
value_to_int = {value: idx for idx, value in enumerate(unique_values)}
# Apply the mapping to the column
df['DepthToGroundwater'] = df['DepthToGroundwater'].map(value_to_int)

# Your existing code for train-test split
unique_ids = df['ID'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.1, random_state=42)
train_df = df[df['ID'].isin(train_ids)]
test_df = df[df['ID'].isin(test_ids)]

X_train = train_df.drop(columns=['GROUNDWATER_LEVEL', 'ID', 'Date'])
y_train = train_df['GROUNDWATER_LEVEL']
X_test = test_df.drop(columns=['GROUNDWATER_LEVEL', 'ID', 'Date'])
y_test = test_df['GROUNDWATER_LEVEL']

# Prepara los grupos para GroupKFold
groups = df['ID']  # Asegúrate de que 'ID' sea la columna que identifica a cada pozo

def create_param_grid():
    # Combinaciones base
    base_param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
        'ccp_alpha': [0.0, 0.01, 0.1],
    }

    # Combinaciones para bootstrap=True
    bootstrap_true_grid = {**base_param_grid, 'bootstrap': [True], 'max_samples': [0.8, None]}

    # Combinaciones para bootstrap=False
    bootstrap_false_grid = {**base_param_grid, 'bootstrap': [False], 'max_samples': [None]}

    # Unimos las dos listas de combinaciones
    return [bootstrap_true_grid, bootstrap_false_grid]

# Configura el GridSearchCV con GroupKFold
gkf = GroupKFold(n_splits=5)

param_grid = create_param_grid()

rf = RandomForestRegressor(random_state=42, oob_score=True)  # Considera ajustar oob_score si es necesario

# Ignora las advertencias de OOB si decides mantener oob_score=True
warnings.filterwarnings('ignore', message='Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates.')

# Configuración de GridSearchCV con verbosidad reducida
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=gkf,
    n_jobs=-1,
    verbose=1,  # Reducir el nivel de verbosidad
    scoring='neg_mean_squared_error',
    error_score=np.nan  # Establecer aquí para manejar errores
)

# Separa las características y la etiqueta, sin dividir aún en conjuntos de entrenamiento y prueba
X = df.drop(columns=['GROUNDWATER_LEVEL', 'ID', 'Date'])
y = df['GROUNDWATER_LEVEL']

# Ajusta el modelo (aquí también podrías capturar las advertencias)
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=UserWarning)
    warnings.filterwarnings('ignore', message='Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates.')
    # Llamada a GridSearchCV o ajuste del RandomForestRegressor
    grid_search.fit(X, y, groups=groups)

# Los mejores parámetros y el mejor modelo se pueden obtener de grid_search como antes
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Fitting 5 folds for each of 1458 candidates, totalling 7290 fits
Best parameters: {'bootstrap': True, 'ccp_alpha': 0.01, 'max_depth': 10, 'max_features': 'sqrt', 'max_samples': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [33]:
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.metrics import make_scorer

# Configuración del modelo con los mejores parámetros encontrados
best_rf = RandomForestRegressor(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    random_state=42,
    oob_score=True
)

# Preparar el GroupKFold para la validación cruzada
gkf = GroupKFold(n_splits=5)

# Preparar el scorer basado en MSE para que funcione con cross_val_score
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Ejecutar la validación cruzada por grupos
scores = cross_val_score(best_rf, X, y, groups=groups, cv=gkf, scoring=mse_scorer)

# Calcular la media y el desvío estándar de los scores
mean_score = scores.mean()
std_score = scores.std()

print(f"Mean MSE from group cross-validation: {mean_score}")
print(f"Standard Deviation of MSE from group cross-validation: {std_score}")

# Para MAE y R², puedes definir otros scorers y repetir el proceso
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mae_scores = cross_val_score(best_rf, X, y, groups=groups, cv=gkf, scoring=mae_scorer)
r2_scores = cross_val_score(best_rf, X, y, groups=groups, cv=gkf, scoring=r2_scorer)

print(f"Mean MAE from group cross-validation: {-mae_scores.mean()}")  # Note el signo negativo para corregir el 'greater_is_better=False'
print(f"Mean R² from group cross-validation: {r2_scores.mean()}")


Mean MSE from group cross-validation: -23.685039556594653
Standard Deviation of MSE from group cross-validation: 14.20276084828574
Mean MAE from group cross-validation: 3.4927295611538804
Mean R² from group cross-validation: 0.011781222591570351
