### This note shows how to use sklearn to set up a regression for p(z)

Do the usual imports

In [None]:
import tables_io
import numpy as np
import matplotlib.pyplot as plt
from rail.raruma import plotting_functions as raruma_plot
from rail.raruma import utility_functions as raruma_util

In [None]:
from rail.estimation.algos.k_nearneigh import KNearNeighInformer, KNearNeighEstimator
from rail.core.data import Hdf5Handle, DataStore, DATA_STORE
from rail.utils.catalog_utils import RomanRubinCatalogConfig
# Obtain the data store object from the DATA_STORE factory and keep a handle to it.
DS = DATA_STORE()
# Class-level switch on DataStore; presumably lets re-run cells replace
# entries that already exist in the store -- confirm against the RAIL docs.
DataStore.allow_overwrite = True
# Apply the Roman-Rubin catalog configuration, selected by its own tag.
RomanRubinCatalogConfig.apply(RomanRubinCatalogConfig.tag)

Change this to be the root of the current PZ working area

In [None]:
# Root directory of the PZ working area; the input file paths are built from it.
pz_dir = "/global/cfs/cdirs/lsst/groups/PZ/DP1"

Load the train/test data

In [None]:
# Read the training and test tables from under the PZ working area.
train = tables_io.read(
    f"{pz_dir}/data/train/dp1_ecdfs_matched_specgold_train.hdf5"
)
test = tables_io.read(
    f"{pz_dir}/data/test/dp1_ecdfs_matched_specgold_test.hdf5"
)
# NOTE(review): leftover snippet kept for reference. `d` is not defined in the
# visible cells, so these lines cannot be re-enabled as written.
#d.keys()
#train = tables_io.sliceObj(d, slice(0, -1, 10))
#test = tables_io.sliceObj(d, slice(1, -1, 10))

In [None]:
# Split each table into regression targets and input features. Both calls use
# the same magnitude-column template and the same band list so that the
# training and test features are built identically.
train_targets, train_features = raruma_util.prepare_data_total_mag_and_colors(
    train,
    '{band}_gaap1p0Mag',
    'ugrizy',
)
test_targets, test_features = raruma_util.prepare_data_total_mag_and_colors(
    test,
    '{band}_gaap1p0Mag',
    'ugrizy',
)

In [None]:
# Plot the n(z) of the training targets (presumably the true redshifts --
# confirm). The result is bound to `_` so the cell does not echo it.
_ = raruma_plot.plot_true_nz(train_targets)

Do PCA for kicks

In [None]:
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [None]:
# Learn the per-feature scaling from the training features only.
scaler = preprocessing.StandardScaler()
scaler.fit(train_features)
# Request one principal component per input feature column, without whitening.
pca = PCA(whiten=False, n_components=train_features.shape[-1])

In [None]:
# Scale the training features with the fitted scaler, then clip the scaled
# values to [-5, 5] so extreme outliers cannot dominate.
scaled_train = scaler.transform(train_features).clip(-5, 5)
# fit_transform learns the PCA basis from the training set AND returns the
# projected training data; plain fit() only returns the estimator itself.
pca_train = pca.fit_transform(scaled_train)

In [None]:
# Scale the test features with the scaler fitted on the training set, then
# clip the scaled values to [-5, 5], exactly as for the training features.
scaled_test = scaler.transform(test_features).clip(-5, 5)
# Project the test set onto the PCA basis already fitted on the training set.
# Calling fit() here would replace that basis with one derived from the test
# data and would return the estimator rather than the projected features.
pca_test = pca.transform(scaled_test)

Do all the regression

In [None]:
from sklearn.ensemble import (HistGradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import HuberRegressor, LinearRegression, QuantileRegressor
from sklearn.svm import NuSVR
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor

In [None]:
# One default-configured instance of every regressor imported above, grouped
# by the scikit-learn module it comes from.

# sklearn.ensemble
hbr, etr, abr = (
    HistGradientBoostingRegressor(),
    ExtraTreesRegressor(),
    AdaBoostRegressor(),
)

# sklearn.gaussian_process
gpr = GaussianProcessRegressor()

# sklearn.isotonic
isr = IsotonicRegression()

# sklearn.linear_model
hur, lir, qur = (
    HuberRegressor(),
    LinearRegression(),
    QuantileRegressor(),
)

# sklearn.svm
nsr = NuSVR()

# sklearn.neighbors
knr, rnr = (
    KNeighborsRegressor(),
    RadiusNeighborsRegressor(),
)

In [None]:
def run_it(reg):
    """Run one regressor through the train/test data and plot the results.

    Passes ``reg`` to ``raruma_util.run_regression`` together with the
    notebook-level ``scaled_train``, ``train_targets`` and ``scaled_test``,
    then draws two diagnostic plots against ``test_targets``.

    Parameters
    ----------
    reg
        Regressor instance handed to ``raruma_util.run_regression``
        (presumably any scikit-learn style estimator -- confirm).

    Returns
    -------
    None
        The function is used for its plotting side effects only.
    """
    preds = raruma_util.run_regression(reg, scaled_train, train_targets, scaled_test)
    # ``nan`` must be passed by keyword: the second positional parameter of
    # np.nan_to_num is ``copy``, so a positional -0.4 is taken as the copy
    # flag and NaN predictions are replaced by 0.0 instead of -0.4.
    _ = raruma_plot.plot_true_predict_fancy(test_targets, np.nan_to_num(preds, nan=-0.4))
    # The statistics plot receives the raw predictions, NaNs included.
    _ = raruma_plot.plot_biweight_stats_v_redshift(test_targets, preds)

In [None]:
# Run the fit-and-plot helper with the default KNeighborsRegressor instance.
run_it(knr)