In [2]:
import pandas as pd

water = pd.read_csv('data/water_quality_training_dataset.csv')
landsat = pd.read_csv('data/landsat_features_training.csv')
terra = pd.read_csv('data/terraclimate_features_training.csv')
climate = pd.read_csv('data/train_climate_data.csv')

print(f"water: {water.shape}")
print(f"landsat: {landsat.shape}")
print(f"terra: {terra.shape}")
print(f"climate: {climate.shape}")

water: (9319, 6)
landsat: (9319, 9)
terra: (9319, 4)
climate: (9319, 10)


In [3]:
df = pd.concat([
    water,
    landsat.drop(columns=['Latitude', 'Longitude', 'Sample Date']),
    terra.drop(columns=['Latitude', 'Longitude', 'Sample Date']),
    climate
], axis=1)

print(f"Shape final: {df.shape}")
df.head()

Shape final: (9319, 23)


Unnamed: 0,Latitude,Longitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus,nir,green,swir16,swir22,...,precip_30d,precip_mean,precip_max,precip_days,temp_mean,temp_max,temp_min,temp_range,et0_mean,et0_sum
0,-28.760833,17.730278,02-01-2011,128.912,555.0,10.0,11190.0,11426.0,7687.5,7645.0,...,2.2,0.070968,1.0,0,25.758065,43.7,14.3,29.4,7.882581,244.36
1,-26.861111,28.884722,03-01-2011,74.72,162.9,163.0,17658.5,9550.0,13746.5,10574.0,...,206.6,6.664516,36.9,21,19.335484,28.7,10.8,17.9,4.76,147.56
2,-26.45,28.085833,03-01-2011,89.254,573.0,80.0,15210.0,10720.0,17974.0,14201.0,...,254.5,8.209677,61.7,23,20.280645,30.2,12.1,18.1,4.883548,151.39
3,-27.671111,27.236944,03-01-2011,82.0,203.6,101.0,14887.0,10943.0,13522.0,11403.0,...,192.9,6.222581,53.5,20,21.822581,32.5,12.7,19.8,5.701613,176.75
4,-27.356667,27.286389,03-01-2011,56.1,145.1,151.0,16828.5,9502.5,12665.5,9643.0,...,199.8,6.445161,53.5,20,21.912903,32.5,12.4,20.1,5.674194,175.9


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
import xgboost as xgb
import numpy as np

targets = ['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']

# Extraer componentes temporales de Sample Date
df['Sample Date'] = pd.to_datetime(df['Sample Date'], dayfirst=True)
df['year'] = df['Sample Date'].dt.year
df['month_sin'] = np.sin(2 * np.pi * df['Sample Date'].dt.month / 12)
df['month_cos'] = np.cos(2 * np.pi * df['Sample Date'].dt.month / 12)
df['day_sin'] = np.sin(2 * np.pi * df['Sample Date'].dt.day / 31)
df['day_cos'] = np.cos(2 * np.pi * df['Sample Date'].dt.day / 31)

# Estacion del año (hemisferio sur)
# Verano: Dic, Ene, Feb | Otoño: Mar, Abr, May | Invierno: Jun, Jul, Ago | Primavera: Sep, Oct, Nov
month = df['Sample Date'].dt.month
season_map = {12: 0, 1: 0, 2: 0,   # Verano
              3: 1, 4: 1, 5: 1,     # Otoño
              6: 2, 7: 2, 8: 2,     # Invierno
              9: 3, 10: 3, 11: 3}   # Primavera
season = month.map(season_map)
df['season_sin'] = np.sin(2 * np.pi * season / 4)
df['season_cos'] = np.cos(2 * np.pi * season / 4)

# Features = Lat, Lon, fecha ciclica + estacion + satelitales/climaticas
drop_cols = targets + ['Sample Date']
X = df.drop(columns=drop_cols)

imputer = SimpleImputer(strategy='median')
X_imp = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

print(f"Features ({len(X_imp.columns)}): {list(X_imp.columns)}")

# Grillas: 3 valores por parametro
rf_param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 0.5, 0.8],
}

xgb_param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [4, 6, 10],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
}

rf_scores = {}
xgb_scores = {}
rf_best_params = {}
xgb_best_params = {}

for target in targets:
    y = df[target]
    print(f"\n{'='*60}")
    print(f"Target: {target}")
    print(f"{'='*60}")

    # Random Forest GridSearch
    rf = RandomForestRegressor(random_state=42, n_jobs=-1)
    rf_grid = GridSearchCV(rf, rf_param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)
    rf_grid.fit(X_imp, y)
    rf_scores[target] = rf_grid.best_score_
    rf_best_params[target] = rf_grid.best_params_
    print(f"[RF]  Mejor R2 CV: {rf_grid.best_score_:.4f}")
    print(f"[RF]  Params: {rf_grid.best_params_}")

    # XGBoost GridSearch
    xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)
    xgb_grid = GridSearchCV(xgb_model, xgb_param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)
    xgb_grid.fit(X_imp, y)
    xgb_scores[target] = xgb_grid.best_score_
    xgb_best_params[target] = xgb_grid.best_params_
    print(f"[XGB] Mejor R2 CV: {xgb_grid.best_score_:.4f}")
    print(f"[XGB] Params: {xgb_grid.best_params_}")

rf_mean = np.mean(list(rf_scores.values()))
xgb_mean = np.mean(list(xgb_scores.values()))

print(f"\n{'='*60}")
print("RESULTADO FINAL")
print("=" * 60)
for target in targets:
    print(f"{target}:")
    print(f"  RF  R2 = {rf_scores[target]:.4f}  |  XGB R2 = {xgb_scores[target]:.4f}")
print()
print(f"Random Forest  - R2 promedio: {rf_mean:.4f}")
print(f"XGBoost        - R2 promedio: {xgb_mean:.4f}")

Features (26): ['Latitude', 'Longitude', 'nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI', 'pet', 'precip_30d', 'precip_mean', 'precip_max', 'precip_days', 'temp_mean', 'temp_max', 'temp_min', 'temp_range', 'et0_mean', 'et0_sum', 'year', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'season_sin', 'season_cos']

Target: Total Alkalinity
Fitting 5 folds for each of 243 candidates, totalling 1215 fits




[RF]  Mejor R2 CV: 0.7073
[RF]  Params: {'max_depth': 30, 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Fitting 5 folds for each of 243 candidates, totalling 1215 fits




[XGB] Mejor R2 CV: 0.7213
[XGB] Params: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500, 'subsample': 0.8}

Target: Electrical Conductance
Fitting 5 folds for each of 243 candidates, totalling 1215 fits




[RF]  Mejor R2 CV: 0.7648
[RF]  Params: {'max_depth': 30, 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Fitting 5 folds for each of 243 candidates, totalling 1215 fits




[XGB] Mejor R2 CV: 0.7717
[XGB] Params: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500, 'subsample': 0.8}

Target: Dissolved Reactive Phosphorus
Fitting 5 folds for each of 243 candidates, totalling 1215 fits


