# Regression test

Test the two-stage ridge regressor on a regular regression dataset.

In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, BayesianRidge
from sklearn.kernel_approximation import Nystroem
from sklearn.compose import ColumnTransformer

from twostageridge import TwoStageRidge

## Load the data - Boston housing

In [2]:
data = load_boston()
X, y = data.data, data.target
y -= y.mean()
y /= y.std()
D = X.shape[1]

## Fit linear models

In [3]:
model = make_pipeline(
    StandardScaler(),
    TwoStageRidge(treatment_index=0)
)

gs = GridSearchCV(
    model,
    param_grid={"twostageridge__regulariser2": [1, 1e1, 1e2, 1e3, 1e4]},
    cv=5
)
gs.fit(X, y)
print(f"best score R^2: {gs.best_score_:.4f}")
print(f"best parameters: \n\t{gs.best_params_}")

best score R^2: 0.4659
best parameters: 
	{'twostageridge__regulariser2': 100.0}


In [4]:
# Score the first stage model
score_z = gs.best_estimator_["twostageridge"].score_stage1(X)
print(f"best score R^2: {score_z:.4f}")

best score R^2: -25.7449


In [5]:
model = make_pipeline(
    StandardScaler(),
    Ridge()
)
gs = GridSearchCV(model, param_grid={"ridge__alpha": [1, 1e1, 1e2, 1e3, 1e4]}, cv=5)
gs.fit(X, y)
print(f"best score R^2: {gs.best_score_:.4f}")
print(f"best parameters: \n\t{gs.best_params_}")

best score R^2: 0.4821
best parameters: 
	{'ridge__alpha': 100.0}


## Non-linear controls

This dataset is known to have some non-linear relationships

In [6]:
# BayesianRidge + Nystroem
model = make_pipeline(
    StandardScaler(),
    Nystroem(n_components=30),
    BayesianRidge()
)
gs = GridSearchCV(model, param_grid={"nystroem__gamma": [1e-3, 1e-2, 0.1, 1.0]}, cv=5)
gs.fit(X, y)
print(f"best score R^2: {gs.best_score_:.4f}")
print(f"best parameters: \n\t{gs.best_params_}")

best score R^2: 0.6840
best parameters: 
	{'nystroem__gamma': 0.001}


In [7]:
# TwoStageRidge + Nystroem
# NOTE: This keeps a linear treatment relationship
model = make_pipeline(
    StandardScaler(),
    ColumnTransformer([
        ("treatment", 'passthrough', slice(0, 1)),
        ("controls", Nystroem(n_components=30), slice(1, D))
    ]),
    TwoStageRidge(treatment_index=0)
)

gs = GridSearchCV(
    model,
    param_grid={
        "twostageridge__regulariser2": [1e-3, 1e-2, 0.1, 1, 10],
        "columntransformer__controls__gamma": [1e-3, 1e-2, 0.1, 1.0]
    },
    cv=5
)
gs.fit(X, y)
print(f"best score R^2: {gs.best_score_:.4f}")
print(f"best parameters: \n\t{gs.best_params_}")

best score R^2: 0.6173
best parameters: 
	{'columntransformer__controls__gamma': 0.01, 'twostageridge__regulariser2': 0.001}


In [8]:
# Score the first stage model
score_z =gs.best_estimator_["twostageridge"].score_stage1(gs.best_estimator_[0:2].transform(X))
print(f"best score R^2: {score_z:.4f}")

best score R^2: 0.5045
