In [1]:
# Install the notebook's dependencies with the %pip magic so they land in the environment of the
# running kernel (a bare `!pip3` may resolve to a different Python installation).
# scikit-learn is listed because the notebook imports `sklearn` further down.
# NOTE(review): versions are unpinned; pin them once a known-good combination is established.
%pip install -q --upgrade pip
%pip install -q pandas numpy matplotlib seaborn openpyxl requests xarray pyproj statsmodels scikit-learn

In [2]:
# Install TensorFlow into the running kernel's environment.
# NOTE(review): no version is pinned here, yet this notebook later imports
# `tensorflow.keras.wrappers.scikit_learn`, which was removed in TensorFlow 2.12 - confirm whether
# a `tensorflow<2.12` pin (or a migration to SciKeras) is required.
%pip install -q tensorflow

Note: you may need to restart the kernel to use updated packages.


In [3]:
import tensorflow as tf

: 

In [None]:
from tensorflow import keras

In [None]:
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

In [3]:
import sys
from pathlib import Path

# Resolve the directory two levels above the notebook's working directory; it is placed on the
# import path so that modules living there can be imported from this notebook.
scripts_dir = Path("../../").resolve()

# Register the directory only once, even when this cell is executed repeatedly.
if sys.path.count(str(scripts_dir)) == 0:
    sys.path.append(str(scripts_dir))

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

: 

In [3]:
# Load the pre-processed Gambia wells dataset (the path is relative to the notebook's working
# directory).
df = pd.read_csv("../../data/processed_data/wells_data_gambia_for_machine_learning.csv")

# Integer-encode 'DepthToGroundwater': each distinct value receives a code in order of first
# appearance (0 for the first distinct value encountered, 1 for the next, and so on).
unique_values = df['DepthToGroundwater'].unique()
value_to_int = dict(zip(unique_values, range(len(unique_values))))

# Overwrite the original column with its integer codes.
df['DepthToGroundwater'] = df['DepthToGroundwater'].map(value_to_int)

In [None]:
# Hold out 10% of the distinct IDs (not 10% of the rows): splitting on unique IDs keeps every
# row that shares an ID on the same side of the train/test boundary.
unique_ids = df['ID'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.1, random_state=42)
train_df = df.loc[df['ID'].isin(train_ids)]
test_df = df.loc[df['ID'].isin(test_ids)]

# Features are every column except the target and the identifier/date columns.
X_train, y_train = (
    train_df.drop(['GROUNDWATER_LEVEL', 'ID', 'Date'], axis=1),
    train_df['GROUNDWATER_LEVEL'],
)
X_test, y_test = (
    test_df.drop(['GROUNDWATER_LEVEL', 'ID', 'Date'], axis=1),
    test_df['GROUNDWATER_LEVEL'],
)

# Factory that builds and compiles the Keras model
def build_model(n_layers=1, n_neurons=30, activation='relu', optimizer='adam'):
    """Build and compile a fully connected Keras regression model.

    The input shape is taken from the module-level ``X_train`` (all of its dimensions except
    the first), so ``X_train`` must exist before this function is called.

    Args:
        n_layers: Number of hidden ``Dense`` layers.
        n_neurons: Number of units in each hidden layer.
        activation: Activation passed to each hidden layer.
        optimizer: Optimizer passed to ``compile``.

    Returns:
        The compiled ``Sequential`` model, using mean squared error as the loss.
    """
    # Assemble the layer stack first: input, hidden layers, then a single-unit regression output.
    stack = [keras.layers.InputLayer(input_shape=X_train.shape[1:])]
    # Dropout or other regularisation could be inserted between hidden layers if needed.
    stack.extend(
        keras.layers.Dense(n_neurons, activation=activation) for _ in range(n_layers)
    )
    stack.append(keras.layers.Dense(1))

    network = keras.models.Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(loss='mean_squared_error', optimizer=optimizer)
    return network

# Wrap the Keras model factory so it exposes the scikit-learn estimator interface.
keras_reg = KerasRegressor(build_fn=build_model)

# Hyper-parameter search space (illustrative values only).
param_distribs = {
    'n_layers': [1, 2, 3],
    'n_neurons': np.arange(1, 100),
    'activation': ['relu', 'tanh'],
    'optimizer': ['adam', 'sgd'],
}

# The search must only ever see the training split. Fitting it on the full frame would let the
# held-out IDs take part in model selection and in the final refit, so the test metrics below
# would be optimistically biased.
X = X_train.values
y = y_train.values

# One group label per training row, aligned with X and y, so that GroupKFold never places rows
# sharing an ID in both the training and the validation part of a fold.
groups = train_df['ID'].values
gkf = GroupKFold(n_splits=5)

# Randomised hyper-parameter search; random_state makes the sampled candidates reproducible.
rnd_search_cv = RandomizedSearchCV(
    keras_reg,
    param_distribs,
    n_iter=10,
    cv=gkf,
    scoring='neg_mean_squared_error',
    random_state=42,
)

# Extra keyword arguments are forwarded to the Keras fit call of every candidate model.
rnd_search_cv.fit(X, y, groups=groups, epochs=100,
                  validation_split=0.1, callbacks=[keras.callbacks.EarlyStopping(patience=10)])

# Best hyper-parameters found by the search.
print("Best parameters:", rnd_search_cv.best_params_)

# Evaluate the refitted best model on the held-out split. Predict once and reuse the result;
# the features are passed as an array, matching what the model was trained on.
model = rnd_search_cv.best_estimator_.model
y_pred = np.asarray(model.predict(X_test.values)).ravel()
mse_test = mean_squared_error(y_test, y_pred)
mae_test = mean_absolute_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f"Test MSE: {mse_test}")
print(f"Test MAE: {mae_test}")
print(f"Test R^2: {r2_test}")
