In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils_io import load_step, save_step
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso

y_train = load_step("y_train_nz")
y_test = load_step("y_test_nz")
X_train_scaled = load_step("X_train_nz_scaled")
X_test_scaled = load_step("X_test_nz_scaled")


Lasso Model Tree, no zeros same fit

In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import Lasso
import numpy as np

class LassoModelTree(BaseEstimator, RegressorMixin):
    """
    Very simple model tree:
      1. Train a regression tree to split the feature space.
      2. In each leaf, fit a separate Lasso regression.
    """

    def __init__(self,
                 max_depth=None,
                 min_samples_leaf=20,
                 tree_random_state=42,
                 lasso_alpha=0.01,
                 lasso_max_iter=10000):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.tree_random_state = tree_random_state
        self.lasso_alpha = lasso_alpha
        self.lasso_max_iter = lasso_max_iter

    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y).ravel()

        # 1) Fit a regression tree for the splits
        self.tree_ = DecisionTreeRegressor(
            criterion="squared_error",
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.tree_random_state,
        )
        self.tree_.fit(X, y)

        # 2) For each leaf, fit a Lasso model on the samples in that leaf
        leaf_ids = self.tree_.apply(X)
        self.leaf_models_ = {}

        for leaf in np.unique(leaf_ids):
            mask = leaf_ids == leaf
            X_leaf = X[mask]
            y_leaf = y[mask]

            model = Lasso(alpha=self.lasso_alpha, max_iter=self.lasso_max_iter)
            model.fit(X_leaf, y_leaf)
            self.leaf_models_[leaf] = model

        return self

    def predict(self, X):
        X = np.asarray(X)
        leaf_ids = self.tree_.apply(X)
        y_pred = np.empty(X.shape[0])

        for leaf in np.unique(leaf_ids):
            mask = leaf_ids == leaf
            model = self.leaf_models_[leaf]
            y_pred[mask] = model.predict(X[mask])

        return y_pred


In [12]:
# ensure y is 1D
y_train_arr = np.ravel(y_train)
y_test_arr = np.ravel(y_test)

mt_lasso = LassoModelTree(
    max_depth=None,        # tune as needed
    min_samples_leaf=200,
    lasso_alpha=0.1,
)

mt_lasso.fit(X_train_scaled, y_train_arr)
y_pred_mt_lasso = mt_lasso.predict(X_test_scaled)

mse = mean_squared_error(y_test_arr, y_pred_mt_lasso)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_arr, y_pred_mt_lasso)

print("Lasso Model Tree")
print(f"MSE :  {mse:.4f}")
print(f"RMSE:  {rmse:.4f}")
print(f"R^2 :  {r2:.4f}")

ValueError: Number of labels=63097 does not match number of samples=70612