In [1]:
!pip3 install -q --upgrade pip
!pip3 install -q pandas numpy matplotlib seaborn openpyxl climateserv requests netCDF4 xarray pyproj statsmodels

In [2]:
import sys
from pathlib import Path

# Calculate the path to the scripts folder relative to the current notebook.
scripts_dir = Path("../../").resolve()

# Add the scripts directory to the sys.path if it's not already there.
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import Nystroem
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV, cross_val_score

In [22]:
# Carga de datos y preparación inicial
df = pd.read_csv("../../data/processed_data/wells_data_gambia_for_machine_learning.csv")
unique_values = df['DepthToGroundwater'].unique()
value_to_int = {value: idx for idx, value in enumerate(unique_values)}
df['DepthToGroundwater'] = df['DepthToGroundwater'].map(value_to_int)

# Print basic dataset info
print("Total rows in dataset:", len(df))
print('')

# Preparación de los conjuntos de datos
unique_ids = df['ID'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=2)
train_df = df[df['ID'].isin(train_ids)]
test_df = df[df['ID'].isin(test_ids)]

# Print train-test split details
print("Unique IDs in the dataset:", len(unique_ids))
print("Unique IDs in training set:", len(train_ids))
print("Unique IDs in testing set:", len(test_ids))
print('')
print("Rows in training set:", len(train_df))
print("Rows in testing set:", len(test_df))

X_train = train_df.drop(columns=['GROUNDWATER_LEVEL', 'ID', 'Date'])
y_train = train_df['GROUNDWATER_LEVEL']
X_test = test_df.drop(columns=['GROUNDWATER_LEVEL', 'ID', 'Date'])
y_test = test_df['GROUNDWATER_LEVEL']

# Print shape of train and test sets
print('')
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print('')
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

# Prepara los grupos para GroupKFold
groups = train_df['ID']


Total rows in dataset: 6636

Unique IDs in the dataset: 36
Unique IDs in training set: 28
Unique IDs in testing set: 8

Rows in training set: 5574
Rows in testing set: 1062

Shape of X_train: (5574, 19)
Shape of y_train: (5574,)

Shape of X_test: (1062, 19)
Shape of y_test: (1062,)


In [23]:
# Define GroupKFold
gkf = GroupKFold(n_splits=5)

# Initialize AdaBoostRegressor with a DecisionTreeRegressor as base estimator
ada_regressor = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(max_depth=3),
    n_estimators=50, 
    learning_rate=1.0, 
    loss='linear', 
    random_state=42
)

# Update the pipeline with AdaBoostRegressor
pipeline = Pipeline([
    ('nystroem', Nystroem(random_state=1)),
    ('adaboost', ada_regressor)
])

# Define the parameter grid for AdaBoost
param_grid = {
    'nystroem__kernel': ['rbf', 'poly', 'sigmoid'],
    'nystroem__gamma': [0.1, 0.5, 1],
    'nystroem__n_components': [100, 200, 300],
    'adaboost__n_estimators': [30, 50, 70],
    'adaboost__learning_rate': [0.01, 0.1, 1]
}

# Grid search with GroupKFold
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=gkf,
    n_jobs=-1,
    verbose=1
)

# Fit the model with groups for the initial grid search
grid_search.fit(X=X_train, y=y_train, groups=groups)

# Best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


Best parameters: {'adaboost__learning_rate': 1, 'adaboost__n_estimators': 30, 'nystroem__gamma': 0.1, 'nystroem__kernel': 'sigmoid', 'nystroem__n_components': 300}


In [24]:
# Extract and adjust parameter names from the initial grid search
best_nystroem_params = {
    'kernel': best_params['nystroem__kernel'],
    'gamma': best_params['nystroem__gamma'],
    'n_components': best_params['nystroem__n_components']
}

best_adaboost_params = {
    'n_estimators': best_params['adaboost__n_estimators'],
    'learning_rate': best_params['adaboost__learning_rate']
}

# Define new, more focused parameter grid for fine-tuning
param_grid_fine = {
    'nystroem__gamma': [best_nystroem_params['gamma'] * factor for factor in [0.8, 1, 1.2]],  # Adjusted here
    'adaboost__n_estimators': [int(best_adaboost_params['n_estimators'] * factor) for factor in [0.8, 1, 1.2]],
    'adaboost__learning_rate': [best_adaboost_params['learning_rate'] * factor for factor in [0.8, 1, 1.2]]
}

# Initialize AdaBoostRegressor with fine-tuned parameters
ada_regressor_fine = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(max_depth=3, random_state=42),
    n_estimators=best_adaboost_params['n_estimators'], 
    learning_rate=best_adaboost_params['learning_rate'], 
    loss='linear', 
    random_state=42
)

# Update the pipeline with fine-tuned AdaBoostRegressor
pipeline_fine = Pipeline([
    ('scaler', StandardScaler()),
    ('nystroem', Nystroem(**best_nystroem_params, random_state=1)),
    ('adaboost', ada_regressor_fine)
])

# New Grid search for fine-tuning
grid_search_fine = GridSearchCV(
    estimator=pipeline_fine,
    param_grid=param_grid_fine,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit the model with groups for the fine-tuning grid search
grid_search_fine.fit(X=X_train, y=y_train, groups=groups)

# Extract new best parameters
new_best_params = grid_search_fine.best_params_
print("New best parameters:", new_best_params)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
New best parameters: {'adaboost__learning_rate': 1, 'adaboost__n_estimators': 36, 'nystroem__gamma': 0.12}


Best parameters: {'adaboost__learning_rate': 1, 'adaboost__n_estimators': 30, 'nystroem__gamma': 0.1, 'nystroem__kernel': 'sigmoid', 'nystroem__n_components': 300}


In [29]:
# Using the best parameters from your grid searches
final_nystroem = Nystroem(
    kernel=best_params['nystroem__kernel'],  # From best parameters
    gamma=new_best_params['nystroem__gamma'],  # From new best parameters
    n_components=best_params['nystroem__n_components'],  # From best parameters
    random_state=1
)

# Initialize AdaBoostRegressor with the updated parameters
final_adaboost = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(max_depth=16, random_state=42),
    n_estimators=new_best_params['adaboost__n_estimators'],  # From new best parameters
    learning_rate=new_best_params['adaboost__learning_rate'],  # From new best parameters
    random_state=42
)

# Set up the final pipeline with the selected parameters
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('nystroem', final_nystroem),
    ('adaboost', final_adaboost)  # Using AdaBoostRegressor here
])

# Train the model on the entire training dataset
final_pipeline.fit(X=X_train, y=y_train)

# Optionally, evaluate the model on the test dataset
test_score = final_pipeline.score(X=X_test, y=y_test)
print(f"Test Score: {test_score}")

# Use the model to make predictions
predictions = final_pipeline.predict(X=X_test)

# Calculate MSE, MAE, and R²
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Coefficient of Determination (R²): {r2}")


Test Score: -0.050409773620879283
Mean Squared Error (MSE): 42.0076913832957
Mean Absolute Error (MAE): 5.671501079593692
Coefficient of Determination (R²): -0.050409773620879283


With StandarScaler:
Test Score: 0.1612956772646813
Mean Squared Error (MSE): 12.934439422717016
Mean Absolute Error (MAE): 2.1590231197897602
Coefficient of Determination (R²): 0.1612956772646813