In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import polars as pl

# Read the raw train/test splits from disk in one pass.
df_train, df_test = (
    pl.read_csv(csv_path)
    for csv_path in ("../data/raw/pond_train.csv", "../data/raw/pond_test.csv")
)

# Feature frame: the two project identifiers, the amount, the funder and the quarter.
X = df_train.select(
    ["project_a", "project_b", "total_amount_usd", "funder", "quarter"]
)
# Target frame: kept as a single-column DataFrame (not a Series).
y = df_train.select(["weight_a"])

In [None]:
import numpy as np
from category_encoders import TargetEncoder
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_val_score,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Modelling pipeline: target-encode the categorical columns, standardise the
# resulting numeric matrix, then fit a histogram-based gradient-boosting
# regressor.
pipeline = Pipeline(
    [
        (
            "target_encoder",
            TargetEncoder(cols=["project_a", "project_b", "funder", "quarter"]),
        ),
        # Scaling is a monotone per-feature transform, so it should not change
        # what a tree-based model learns; the step is kept so the pipeline
        # layout stays as it was.
        ("scaler", StandardScaler()),
        # NOTE: this step is a regressor. The name "classifier" is kept because
        # the hyperparameter grid addresses it through the "classifier__" prefix.
        # random_state pins the estimator's internal randomness (e.g. the
        # validation split used when early stopping is active) so that repeated
        # runs give the same scores, matching the seeded KFold splitters.
        ("classifier", HistGradientBoostingRegressor(random_state=42)),
    ]
)

# Splitters for nested cross-validation: 5 shuffled folds for the outer
# (evaluation) loop and 3 shuffled folds for the inner (tuning) loop, both
# seeded with 42.
outer_cv, inner_cv = (
    KFold(n_splits=n_folds, shuffle=True, random_state=42) for n_folds in (5, 3)
)

# Hyperparameter search space for tuning. The "classifier__" prefix routes each
# entry to the pipeline step registered under that name.
param_grid = {}
param_grid["classifier__learning_rate"] = [0.01, 0.05, 0.1]
param_grid["classifier__max_depth"] = [3, 5, 7]
param_grid["classifier__min_samples_leaf"] = [1, 2, 4]

# Nested cross-validation: the inner loop (GridSearchCV over inner_cv) picks
# hyperparameters, the outer loop (cross_val_score over outer_cv) scores the
# tuned estimator on data the search never saw.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=inner_cv,
    scoring="neg_mean_absolute_error",
)

nested_scores = cross_val_score(
    grid_search,
    X.to_pandas(),
    # Flatten the target to 1-D: a single-column frame is a column vector,
    # which scikit-learn regressors accept only with a data-conversion warning.
    np.ravel(y.to_pandas()),
    cv=outer_cv,
    scoring="neg_mean_absolute_error",
)

# The scorer returns *negated* MAE (higher is better), so flip the sign before
# reporting. This is a regression error in the target's units, not an accuracy.
nested_mae = -nested_scores
print(f"Nested CV MAE: {np.mean(nested_mae):.4f} ± {np.std(nested_mae):.4f}")