In [1]:
!pip3 install -q --upgrade pip
!pip3 install -q pandas numpy matplotlib seaborn openpyxl climateserv requests netCDF4 xarray pyproj statsmodels

In [2]:
import sys
from pathlib import Path

# Resolve the directory two levels above the current working directory; it is
# put on the import path below so that modules located there can be imported.
scripts_dir = Path("../../").resolve()

# Register the directory only once, so re-running this cell does not pile up
# duplicate sys.path entries.
if sys.path.count(str(scripts_dir)) == 0:
    sys.path.append(str(scripts_dir))

import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Didn't work: took too much time (1120 minutes)

In [None]:
# Load the data and do the initial preparation
df = pd.read_csv("../../data/processed_data/wells_data_gambia_for_machine_learning.csv")

# Replace every distinct 'DepthToGroundwater' value with an integer code,
# numbered in order of first appearance in the column.
# NOTE(review): the resulting codes carry no natural ordering — confirm that an
# arbitrary ordinal encoding is what the model is meant to see.
unique_values = df['DepthToGroundwater'].unique()
value_to_int = dict(zip(unique_values, range(len(unique_values))))
df['DepthToGroundwater'] = df['DepthToGroundwater'].map(value_to_int)

# Build the train/test sets by splitting the unique 'ID' values (90% / 10%),
# so that no ID contributes rows to both sets.
unique_ids = df['ID'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.1, random_state=42)
train_df = df.loc[df['ID'].isin(train_ids)]
test_df = df.loc[df['ID'].isin(test_ids)]

# The target, the grouping key and the date column are not used as features.
non_feature_cols = ['GROUNDWATER_LEVEL', 'ID', 'Date']
X_train = train_df.drop(columns=non_feature_cols)
X_test = test_df.drop(columns=non_feature_cols)
y_train = train_df['GROUNDWATER_LEVEL']
y_test = test_df['GROUNDWATER_LEVEL']

# Group labels for GroupKFold: one label per training row
groups_train = train_df['ID']

# Hyper-parameter grid for SVR (keys are plain SVR argument names)
param_grid_svr = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 1],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Group-aware folds: rows sharing an ID never end up on both sides of a split
gkf = GroupKFold(n_splits=5)
svr = SVR()

# Support vector machines are not scale invariant: unscaled features distort the
# kernel and can make the solver very slow to converge. Standardising inside a
# Pipeline re-fits the scaler on the training part of every fold, so no
# validation statistics leak into training.
svr_pipeline = Pipeline([('scaler', StandardScaler()), ('svr', svr)])

grid_search_svr = GridSearchCV(
    estimator=svr_pipeline,
    # Parameters of a pipeline step are addressed as "<step>__<parameter>"
    param_grid={f"svr__{name}": values for name, values in param_grid_svr.items()},
    cv=gkf,
    n_jobs=-1,
    verbose=1,
    scoring='neg_mean_squared_error'
)

# Run the grid search
grid_search_svr.fit(X=X_train, y=y_train, groups=groups_train)

# Best parameters, with the "svr__" step prefix stripped so the keys remain
# plain SVR argument names
best_params_svr = {
    name.split("__", 1)[1]: value
    for name, value in grid_search_svr.best_params_.items()
}
print("Best parameters for SVR:", best_params_svr)

# Fresh, unfitted model with the winning hyper-parameters and the same scaling
# step that was used while tuning them
best_svr = Pipeline([('scaler', StandardScaler()), ('svr', SVR(**best_params_svr))])

# Performance metrics for SVR. Error scorers are negated by scikit-learn
# (greater_is_better=False), so they are sign-flipped again when printed.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

# Evaluate all three metrics in a single cross-validation pass: the model is
# fitted once per fold instead of once per fold per metric.
group_cv_scores = cross_validate(
    best_svr,
    X_train,
    y_train,
    groups=groups_train,
    cv=gkf,
    scoring={'mse': mse_scorer, 'mae': mae_scorer, 'r2': r2_scorer}
)
mse_scores = group_cv_scores['test_mse']
mae_scores = group_cv_scores['test_mae']
r2_scores = group_cv_scores['test_r2']

# Print the results
print(f"Mean MSE from group cross-validation: {-mse_scores.mean()}")
print(f"Mean MAE from group cross-validation: {-mae_scores.mean()}")
print(f"Mean R² from group cross-validation: {r2_scores.mean()}")


In [None]:
# Group-aware cross-validation of the selected model, scoring three metrics at
# once and keeping the training-fold scores alongside the validation ones.
scoring = {'mse': mse_scorer, 'mae': mae_scorer, 'r2': r2_scorer}
cv_results = cross_validate(
    estimator=best_svr,
    X=X_train,
    y=y_train,
    groups=groups_train,
    scoring=scoring,
    cv=gkf,
    return_train_score=True
)

# Print the cross-validation results. The error scorers were built with
# greater_is_better=False, so their means are negated to report positive errors.
print("Validación Cruzada - SVR")
for label, key, sign in (("MSE", "mse", -1), ("MAE", "mae", -1), ("R²", "r2", 1)):
    for split_label, split in (("Train", "train"), ("Test", "test")):
        print(f"{label} ({split_label}):", sign * cv_results[f"{split}_{key}"].mean())
