### This notebook shows an example of doing some data preparation and using sklearn to do a regression

The usual imports

In [None]:
import tables_io
import numpy as np
import matplotlib.pyplot as plt
from rail.raruma import plotting_functions as raruma_plot
from rail.raruma import utility_functions as raruma_util


Change this to be the root of the current PZ working area

In [None]:
# Root directory of the PZ working area; used below to locate the input file.
pz_dir = "/global/cfs/cdirs/lsst/groups/PZ/DP1"
# Fallback location to try if the directory above cannot be read:
# pz_dir = "/global/u2/e/echarles/dx"

Read a test file (in this case a Roman / Rubin open universe sim)

In [None]:
# Build the input file path under the working area, then load it.
sim_path = f"{pz_dir}/data/sandbox_data/roman_rubin_9925.hdf5"
d = tables_io.read(sim_path)

In [None]:
# Show the keys of the loaded object (presumably one per column -- confirm).
d.keys()

Split it in half into training and test sets

In [None]:
# Interleaved 50/50 split: even-indexed rows go to the training set and
# odd-indexed rows go to the test set.  The slice stop is left open (None) so
# the final row is kept; a stop of -1 would silently drop the last row from
# whichever set it belongs to.
train = tables_io.sliceObj(d, slice(0, None, 2))
test = tables_io.sliceObj(d, slice(1, None, 2))

Set up a regression algorithm

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [None]:
# Histogram-based gradient-boosting regressor, allowed up to 5000 boosting
# iterations.  random_state is pinned so the estimator's internal randomness
# (e.g. the train/validation split used when early stopping is active) is
# reproducible from one run of the notebook to the next.
reg = HistGradientBoostingRegressor(max_iter=5000, random_state=42)

Extract targets (spec-z redshifts) and features (colors and magnitudes) from the data

In [None]:
# Build the (targets, features) pair for each sample using the same band-name
# template and band list.  The training sample is processed first, then the
# test sample.
(train_targets, train_features), (test_targets, test_features) = (
    raruma_util.prepare_data_total_mag_and_colors(sample, 'LSST_obs_{band}', 'ugrizy')
    for sample in (train, test)
)

Do some data preparation, scaling the inputs

In [None]:
from sklearn import preprocessing

In [None]:
# Create the standardizer, then fit it on the training features only.
scaler = preprocessing.StandardScaler()
scaler = scaler.fit(train_features)

In [None]:
# Inspect the standardized training features (displayed only; the result is
# not stored here).
scaler.transform(train_features)

In [None]:
# Standardize the training features, then limit every value to [-5, 5]
# (presumably to tame extreme outliers -- confirm).
scaled = np.clip(scaler.transform(train_features), -5, 5)

In [None]:
# Smallest value in the first column of the raw (unscaled) training features.
train_features[:, 0].min()

In [None]:
# Show the dimensions of the training feature array.
train_features.shape

In [None]:
# Disabled experiment: cluster the training features with DBSCAN.
# from sklearn.cluster import DBSCAN

# db = DBSCAN(eps=0.3, min_samples=10)
# db.fit(train_features)

In [None]:
# labels = np.array(db.labels_)

In [None]:
# np.bincount(labels+1)

Ok, let's run a PCA on the scaled features

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Request as many principal components as the scaled array has columns along
# its last axis, and leave the components unwhitened.
n_components = scaled.shape[-1]
pca = PCA(n_components=n_components, whiten=False)

In [None]:
# Fit the PCA on the scaled training features.  `scaled` is already clipped to
# [-5, 5] when it is created, so it is passed as-is (the same array that is
# later given to pca.transform) instead of being clipped a second time.
pca.fit(scaled)

In [None]:
# Show the fraction of the total variance explained by each fitted component.
pca.explained_variance_ratio_

In [None]:
# Project the scaled training features onto the fitted principal components.
pca_out = pca.transform(scaled)

Ok, let's make some feature plots...

In [None]:
# Histogram the scaled, clipped training features (presumably one panel per
# feature -- confirm against the plotting helper).
fig = raruma_plot.plot_feature_histograms(scaled)

In [None]:
# Same histogram plot, now for the PCA-projected features.  Note that this
# rebinds `fig`, replacing the figure from the previous cell.
fig = raruma_plot.plot_feature_histograms(pca_out)

In [None]:
# 2D histograms relating the input features to the PCA outputs.
# NOTE(review): this passes the raw `train_features`, whereas `pca_out` was
# derived from `scaled` -- confirm that comparing against the unscaled
# features is intended.
_ = raruma_plot.plot_pca_hist2d(train_features, pca_out)

In [None]:
# 2D histograms of the raw training features against the training targets.
_ = raruma_plot.plot_feature_target_hist2d(train_features, train_targets)

In [None]:
# Same feature-vs-target plot, using the PCA-projected features instead.
_ = raruma_plot.plot_feature_target_hist2d(pca_out, train_targets)

In [None]:
# _ = raruma_plot.plot_features_target_scatter(pca_out, train_targets)

Run an example regression

In [None]:
# Run the regressor via the helper (presumably fit on the training set and
# predict on the test features -- confirm against run_regression).
# NOTE(review): the raw features are used here; the scaled / PCA-projected
# features computed above are not passed to the regression in this notebook.
preds = raruma_util.run_regression(reg, train_features, train_targets, test_features)

In [None]:
# Compare the test-set targets with the regression predictions.
_ = raruma_plot.plot_true_predict_fancy(test_targets, preds)