In [1]:
!pip3 install -q --upgrade pip
!pip3 install -q pandas numpy matplotlib seaborn openpyxl climateserv requests netCDF4 xarray pyproj statsmodels

In [2]:
import sys
from pathlib import Path

# Calculate the path to the scripts folder relative to the current notebook.
scripts_dir = Path("../../").resolve()

# Add the scripts directory to the sys.path if it's not already there.
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVR
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import Nystroem
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV, cross_val_score

In [None]:
# Carga de datos y preparación inicial (igual que tu código)
df = pd.read_csv("../../data/processed_data/wells_data_gambia_for_machine_learning.csv")
unique_values = df['DepthToGroundwater'].unique()
value_to_int = {value: idx for idx, value in enumerate(unique_values)}
df['DepthToGroundwater'] = df['DepthToGroundwater'].map(value_to_int)

# Preparación de los conjuntos de datos (igual que tu código)
unique_ids = df['ID'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.1, random_state=42)
train_df = df[df['ID'].isin(train_ids)]
test_df = df[df['ID'].isin(test_ids)]

X_train = train_df.drop(columns=['GROUNDWATER_LEVEL', 'ID', 'Date'])
y_train = train_df['GROUNDWATER_LEVEL']
X_test = test_df.drop(columns=['GROUNDWATER_LEVEL', 'ID', 'Date'])
y_test = test_df['GROUNDWATER_LEVEL']

# Prepara los grupos para GroupKFold (igual que tu código)
groups = train_df['ID']

In [4]:
# Define GroupKFold
gkf = GroupKFold(n_splits=5)

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('nystroem', Nystroem(random_state=1)),
    ('svr', LinearSVR(max_iter=10000, random_state=42))
])

# Define the parameter grid
param_grid = {
    'nystroem__kernel': ['rbf', 'poly', 'sigmoid'],
    'nystroem__gamma': [0.1, 0.5, 1],
    'nystroem__n_components': [100, 200, 300],
    'svr__C': [0.1, 1, 10]
}

# Grid search with GroupKFold
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=gkf,
    n_jobs=-1,
    verbose=1
)

# Fit the model with groups for the initial grid search
grid_search.fit(X=X_train, y=y_train, groups=groups)

# Best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Fitting 5 folds for each of 81 candidates, totalling 405 fits




Best parameters: {'nystroem__gamma': 0.1, 'nystroem__kernel': 'sigmoid', 'nystroem__n_components': 300, 'svr__C': 1}


In [6]:
# Apply best parameters
nystroem = Nystroem(
    kernel=best_params['nystroem__kernel'],
    gamma=best_params['nystroem__gamma'],
    n_components=best_params['nystroem__n_components'],
    random_state=1
)

linear_svr = LinearSVR(
    C=best_params['svr__C'],
    max_iter=10000,
    random_state=42
)

# Define a new, more focused parameter grid for fine-tuning
param_grid_fine = {
    'nystroem__gamma': [best_params['nystroem__gamma'] * factor for factor in [0.8, 1, 1.2]],
    'svr__C': [best_params['svr__C'] * factor for factor in [0.8, 1, 1.2]]
}

# Update the pipeline with the best parameters
best_model.set_params(**best_params)

# New Grid search for fine-tuning
grid_search_fine = GridSearchCV(
    estimator=best_model,  # Use the best model from the initial grid search
    param_grid=param_grid_fine,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit the model with groups for the fine-tuning grid search
grid_search_fine.fit(X=X_train, y=y_train, groups=groups)

# Extract new best parameters
new_best_params = grid_search_fine.best_params_
print("New best parameters:", new_best_params)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




New best parameters: {'nystroem__gamma': 0.08000000000000002, 'svr__C': 0.8}


In [7]:
# Using the best parameters from your grid searches
final_nystroem = Nystroem(
    kernel='sigmoid',  # From best parameters
    gamma=0.08,  # From new best parameters
    n_components=100,  # From best parameters
    random_state=1
)

final_linear_svr = LinearSVR(
    C=0.12,  # From new best parameters
    max_iter=10000,
    random_state=42
)

# Set up the final pipeline with the selected parameters
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('nystroem', final_nystroem),
    ('svr', final_linear_svr)
])

# Train the model on the entire training dataset
final_pipeline.fit(X_train, y_train)

# Optionally, evaluate the model on the test dataset
test_score = final_pipeline.score(X_test, y_test)
print(f"Test Score: {test_score}")

# Use the model to make predictions
predictions = final_pipeline.predict(X_test)

# Calculate MSE, MAE, and R²
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Coefficient of Determination (R²): {r2}")


Test Score: 0.3462585938553989
Mean Squared Error (MSE): 10.081954255728439
Mean Absolute Error (MAE): 2.3653571234813087
Coefficient of Determination (R²): 0.3462585938553989


