In [83]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Registry of datasets: for each dataset name, the .npy file holding the
# feature array ("X") and the .npy file holding the target array ("y").
DATA = {
    "lung": {
        "X": "data/lung_features.npy",
        "y": "data/lung_target.npy",
    },
    "bladder": {
        "X": "data/bladder_features.npy",
        "y": "data/bladder_target.npy",
    },
}

In [85]:
def flatten(arr):
    """Collapse every axis after the first into a single feature axis.

    Parameters
    ----------
    arr : array-like
        Input with at least one dimension. The first axis is kept as the
        row (sample) axis.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(arr.shape[0], prod(arr.shape[1:]))``. A 3-D input
        ``(n, t, p)`` becomes ``(n, t * p)``; a 2-D input keeps its shape;
        a 1-D input becomes a single column ``(n, 1)``.

    Raises
    ------
    ValueError
        If ``arr`` is 0-dimensional.
    """
    arr = np.asanyarray(arr)
    if arr.ndim == 0:
        raise ValueError("flatten() needs an array with at least one dimension")

    n_rows = arr.shape[0]
    # Compute the width explicitly instead of passing -1: numpy cannot infer
    # a -1 dimension when the array has zero rows.
    width = int(np.prod(arr.shape[1:], dtype=np.int64))
    return arr.reshape(n_rows, width)

In [None]:
def dt_grid_baseline(name, *, seed=42, test_size=0.2, cv=3, param_grid=None, n_jobs=-1):
    """Grid-search a decision-tree regressor on one dataset and score it on a hold-out split.

    The feature array is loaded from ``DATA[name]["X"]`` and passed through
    ``flatten``; the target is loaded from ``DATA[name]["y"]``. The data is
    split once into train and test parts, the grid search is fitted on the
    train part only, and the metrics are computed on the test part.

    Parameters
    ----------
    name : str
        Key into ``DATA`` selecting the dataset.
    seed : int, default 42
        Random state used for both the train/test split and the tree.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    cv : int or CV splitter, default 3
        Forwarded to ``GridSearchCV``.
    param_grid : dict or None, default None
        Grid over pipeline parameters (``tree__*`` keys). ``None`` selects
        the built-in grid over depth, min_samples_split and min_samples_leaf.
    n_jobs : int, default -1
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        ``MAE``, ``MSE`` and ``R2`` of the selected model on the test part,
        and ``best``: the parameter combination chosen by the grid search.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``DATA``.
    """
    if name not in DATA:
        raise KeyError(f"unknown dataset {name!r}; expected one of {sorted(DATA)}")

    paths = DATA[name]
    X = flatten(np.load(paths["X"]))
    y = np.load(paths["y"])

    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=test_size, random_state=seed
    )

    pipe = Pipeline([
        # VarianceThreshold with its default threshold drops zero-variance
        # (constant) columns; it is fitted inside each CV fold via the pipeline.
        ("const", VarianceThreshold()),
        ("tree",  DecisionTreeRegressor(random_state=seed)),
    ])

    if param_grid is None:
        param_grid = {
            "tree__max_depth":        [None, 5, 10, 20, 30],
            "tree__min_samples_split": [2, 5, 10],
            "tree__min_samples_leaf":  [1, 2, 4],
        }

    gs = GridSearchCV(
        pipe,
        param_grid,
        scoring="neg_mean_squared_error",
        cv=cv,
        n_jobs=n_jobs,
    ).fit(Xtr, ytr)

    # With GridSearchCV's default refit, predict() uses the best pipeline
    # refitted on the whole training part.
    pred = gs.predict(Xte)

    return dict(
        MAE = mean_absolute_error(yte, pred),
        MSE = mean_squared_error(yte, pred),
        R2  = r2_score(yte, pred),
        best= gs.best_params_,
    )

In [None]:
# Run the decision-tree baseline on each dataset and print one summary line
# per dataset: name (padded to 8 columns), MAE, MSE, signed R², chosen params.
for ds in ("lung", "bladder"):
    s = dt_grid_baseline(ds)
    print("  ".join((
        f"{ds.upper():8}",
        f"MAE={s['MAE']:.6f}",
        f"MSE={s['MSE']:.8f}",
        f"R²={s['R2']:+.3f}",
        f"Params={s['best']}",
    )))

LUNG      MAE=0.000771  MSE=0.00000180  R²=-0.118  Params={'tree__max_depth': 5, 'tree__min_samples_leaf': 4, 'tree__min_samples_split': 2}
BLADDER   MAE=0.000196  MSE=0.00000012  R²=+0.539  Params={'tree__max_depth': 5, 'tree__min_samples_leaf': 4, 'tree__min_samples_split': 2}
