In [1]:
!pip3 install -q --upgrade pip
!pip3 install -q pandas numpy matplotlib seaborn openpyxl climateserv requests netCDF4 xarray pyproj statsmodels

In [2]:
import sys
from pathlib import Path

# Calculate the path to the scripts folder relative to the current notebook.
scripts_dir = Path("../../").resolve()

# Add the scripts directory to the sys.path if it's not already there.
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import Nystroem
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV, cross_val_score

In [3]:
# Carga de datos y preparación inicial (igual que tu código)
df = pd.read_csv("../../data/processed_data/wells_data_gambia_for_machine_learning.csv")
unique_values = df['DepthToGroundwater'].unique()
value_to_int = {value: idx for idx, value in enumerate(unique_values)}
df['DepthToGroundwater'] = df['DepthToGroundwater'].map(value_to_int)

# Preparación de los conjuntos de datos (igual que tu código)
unique_ids = df['ID'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.1, random_state=42)
train_df = df[df['ID'].isin(train_ids)]
test_df = df[df['ID'].isin(test_ids)]

X_train = train_df.drop(columns=['GROUNDWATER_LEVEL', 'ID', 'Date'])
y_train = train_df['GROUNDWATER_LEVEL']
X_test = test_df.drop(columns=['GROUNDWATER_LEVEL', 'ID', 'Date'])
y_test = test_df['GROUNDWATER_LEVEL']

# Prepara los grupos para GroupKFold (igual que tu código)
groups = train_df['ID']

In [16]:
# Define GroupKFold
gkf = GroupKFold(n_splits=5)

# Define the pipeline with SGDRegressor
pipeline = Pipeline([
    ('nystroem', Nystroem(random_state=1)),
    ('sgd', SGDRegressor(max_iter=10000, random_state=42))
])

# Define the parameter grid for SGDRegressor
param_grid = {
    'nystroem__kernel': ['rbf', 'poly', 'sigmoid'],
    'nystroem__gamma': [0.1, 0.5, 1],
    'nystroem__n_components': [100, 200, 300],
    'sgd__alpha': [0.0001, 0.001, 0.01],  # Regularization strength
    'sgd__penalty': ['l2', 'l1', 'elasticnet'],  # Type of regularization
    'sgd__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=gkf,
    n_jobs=-1,
    verbose=1
)

# Fit the model with groups for the initial grid search
grid_search.fit(X=X_train, y=y_train, groups=groups)

# Best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Fitting 5 folds for each of 972 candidates, totalling 4860 fits




Best parameters: {'nystroem__gamma': 0.1, 'nystroem__kernel': 'sigmoid', 'nystroem__n_components': 100, 'sgd__alpha': 0.0001, 'sgd__learning_rate': 'optimal', 'sgd__penalty': 'l1'}


In [19]:
# Apply best parameters from the initial grid search
nystroem = Nystroem(
    kernel=best_params['nystroem__kernel'],
    gamma=best_params['nystroem__gamma'],
    n_components=best_params['nystroem__n_components'],
    random_state=1
)

sgd_regressor = SGDRegressor(
    alpha=best_params['sgd__alpha'],  # Adjust according to your initial best_params
    penalty=best_params['sgd__penalty'],  # Adjust according to your initial best_params
    learning_rate=best_params['sgd__learning_rate'],  # Adjust according to your initial best_params
    max_iter=10000,
    random_state=42
)

# Define a more focused parameter grid for fine-tuning
param_grid_fine = {
    'nystroem__gamma': [best_params['nystroem__gamma'] * factor for factor in [0.8, 1, 1.2]],
    'sgd__alpha': [best_params['sgd__alpha'] * factor for factor in [0.8, 1, 1.2]]  # Fine-tune alpha
    # Add more SGD parameters if needed
}

# Update the pipeline with the best parameters for fine-tuning
best_model.set_params(**best_params)

# New Grid search for fine-tuning
grid_search_fine = GridSearchCV(
    estimator=best_model,  # Use the best model from the initial grid search
    param_grid=param_grid_fine,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit the model with groups for the fine-tuning grid search
grid_search_fine.fit(X=X_train, y=y_train, groups=groups)

# Extract new best parameters
new_best_params = grid_search_fine.best_params_
print("New best parameters:", new_best_params)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
New best parameters: {'nystroem__gamma': 0.06400000000000002, 'sgd__alpha': 0.0014399999999999999}


Best parameters: {'nystroem__gamma': 0.1, 'nystroem__kernel': 'sigmoid', 'nystroem__n_components': 100, 'sgd__alpha': 0.0001, 'sgd__learning_rate': 'optimal', 'sgd__penalty': 'l1'}


In [20]:
# Updated best parameters from your grid searches
best_params = {
    'nystroem__kernel': 'sigmoid',
    'nystroem__gamma': 0.06400000000000002,  # Updated gamma value from fine-tuning
    'nystroem__n_components': 100,  # From initial best parameters
    'sgd__alpha': 0.0014399999999999999,  # Updated alpha value from fine-tuning
    'sgd__learning_rate': 'optimal',  # From initial best parameters
    'sgd__penalty': 'l1'  # From initial best parameters
}

# Using the best parameters from your grid searches
final_nystroem = Nystroem(
    kernel=best_params['nystroem__kernel'],
    gamma=best_params['nystroem__gamma'],
    n_components=best_params['nystroem__n_components'],
    random_state=1
)

final_sgd_regressor = SGDRegressor(
    alpha=best_params['sgd__alpha'],
    learning_rate=best_params['sgd__learning_rate'],
    penalty=best_params['sgd__penalty'],
    max_iter=10000,
    random_state=42
)

# Set up the final pipeline with the selected parameters
final_pipeline = Pipeline([
    ('nystroem', final_nystroem),
    ('sgd_regressor', final_sgd_regressor)  # Updated to SGDRegressor
])

# Train the model on the entire training dataset
final_pipeline.fit(X=X_train, y=y_train)

# Optionally, evaluate the model on the test dataset
test_score = final_pipeline.score(X=X_test, y=y_test)
print(f"Test Score: {test_score}")

# Use the model to make predictions
predictions = final_pipeline.predict(X=X_test)

# Calculate MSE, MAE, and R²
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Coefficient of Determination (R²): {r2}")


Test Score: -0.45615162197368275
Mean Squared Error (MSE): 22.45666849943444
Mean Absolute Error (MAE): 2.5194802727090284
Coefficient of Determination (R²): -0.45615162197368275
