In [1]:
!pip3 install -q --upgrade pip
!pip3 install -q pandas numpy matplotlib seaborn openpyxl climateserv requests netCDF4 xarray pyproj statsmodels

In [2]:
import sys
from pathlib import Path

# Calculate the path to the scripts folder relative to the current notebook.
scripts_dir = Path("../../").resolve()

# Add the scripts directory to the sys.path if it's not already there.
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV, cross_val_score

In [3]:
# Carga de datos y preparación inicial (igual que tu código)
df = pd.read_csv("../../data/processed_data/wells_data_gambia_for_machine_learning.csv")
unique_values = df['DepthToGroundwater'].unique()
value_to_int = {value: idx for idx, value in enumerate(unique_values)}
df['DepthToGroundwater'] = df['DepthToGroundwater'].map(value_to_int)

# Preparación de los conjuntos de datos (igual que tu código)
unique_ids = df['ID'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.1, random_state=42)
train_df = df[df['ID'].isin(train_ids)]
test_df = df[df['ID'].isin(test_ids)]

X_train = train_df.drop(columns=['GROUNDWATER_LEVEL', 'ID', 'Date'])
y_train = train_df['GROUNDWATER_LEVEL']
X_test = test_df.drop(columns=['GROUNDWATER_LEVEL', 'ID', 'Date'])
y_test = test_df['GROUNDWATER_LEVEL']

# Prepara los grupos para GroupKFold (igual que tu código)
groups = df['ID']

def create_gbm_param_grid():
    """
    Crea una cuadrícula de parámetros para el GradientBoostingRegressor.
    """
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.8, 1.0]  # Fracción de muestras para ajustar los aprendices individuales
    }
    return param_grid

In [5]:
# Configuración de GridSearchCV con GroupKFold para GBM
gkf = GroupKFold(n_splits=5)
param_grid_gbm = create_gbm_param_grid()

gbm = GradientBoostingRegressor(random_state=42)

grid_search_gbm = GridSearchCV(
    estimator=gbm,
    param_grid=param_grid_gbm,
    cv=gkf,
    n_jobs=-1,
    verbose=1,
    scoring='neg_mean_squared_error',
    error_score=np.nan
)

# Asegúrate de que 'groups' solo contenga los IDs correspondientes a 'X_train'
groups_train = train_df['ID']

# Ahora puedes hacer el ajuste con GridSearchCV
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    grid_search_gbm.fit(X=X_train, y=y_train, groups=groups_train)


# Mejores parámetros y modelo
best_params_gbm = grid_search_gbm.best_params_
print("Best parameters for GBM:", best_params_gbm)

# Configuración del mejor modelo GBM
best_gbm = GradientBoostingRegressor(
    n_estimators=best_params_gbm['n_estimators'],
    learning_rate=best_params_gbm['learning_rate'],
    max_depth=best_params_gbm['max_depth'],
    min_samples_split=best_params_gbm['min_samples_split'],
    min_samples_leaf=best_params_gbm['min_samples_leaf'],
    subsample=best_params_gbm['subsample'],
    random_state=42
)

# Validación cruzada para GBM
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

# Asegúrate de que 'groups_train' solo contenga los IDs correspondientes a 'X_train'
groups_train = train_df['ID']

# Utiliza 'groups_train' en lugar de 'groups' en cross_val_score
mse_scores = cross_val_score(best_gbm, X_train, y_train, groups=groups_train, cv=gkf, scoring=mse_scorer)
mae_scores = cross_val_score(best_gbm, X_train, y_train, groups=groups_train, cv=gkf, scoring=mae_scorer)
r2_scores = cross_val_score(best_gbm, X_train, y_train, groups=groups_train, cv=gkf, scoring=r2_scorer)

# Imprime los resultados
print(f"Mean MSE from group cross-validation: {-mse_scores.mean()}")
print(f"Mean MAE from group cross-validation: {-mae_scores.mean()}")
print(f"Mean R² from group cross-validation: {r2_scores.mean()}")


Fitting 5 folds for each of 486 candidates, totalling 2430 fits
Best parameters for GBM: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100, 'subsample': 1.0}
Mean MSE from group cross-validation: 46.1089520520666
Mean MAE from group cross-validation: 4.998856982979899
Mean R² from group cross-validation: -0.4049811386436691


In [6]:
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# Asumiendo que 'best_gbm' es tu modelo ajustado

# Importancia de características
feature_importance = best_gbm.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.array(X_train.columns)[sorted_idx])
plt.title('Importancia de Características')

# Curvas de aprendizaje
train_sizes, train_scores, test_scores = learning_curve(
    best_gbm, X_train, y_train, cv=gkf, n_jobs=-1, 
    train_sizes=np.linspace(.1, 1.0, 5)
)

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure()
plt.title("Curvas de Aprendizaje")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
         label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
         label="Cross-validation score")
plt.legend(loc="best")


NotFittedError: This GradientBoostingRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.