In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import tables_io
from matplotlib.figure import Figure
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import (HistGradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import HuberRegressor, LinearRegression, QuantileRegressor
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.svm import NuSVR

from rail.raruma import plotting_functions

In [None]:
# Root directory of the photo-z data.  Set the PZ_DIR environment variable to
# point at your own copy instead of editing this cell, for example:
#   /global/cfs/cdirs/lsst/groups/PZ/DP1
# or, if that fails:
#   /global/u2/e/echarles/dx
# When PZ_DIR is not set, the local default below is used.
pz_dir = os.environ.get('PZ_DIR', '/Users/echarles/pz')

In [None]:
# Read the train and test catalogs (HDF5 files) from the ecdfs sandbox
# directory under pz_dir.  Both tables are indexed by column name below.
train = tables_io.read(f'{pz_dir}/sandbox_data/ecdfs/lsst_cat_matched_nonan_train.hdf5')
test = tables_io.read(f'{pz_dir}/sandbox_data/ecdfs/lsst_cat_matched_nonan_test.hdf5')


In [None]:
def make_mask(t):
    """Return the row-selection mask for a catalog table.

    A row is selected when all of the following hold:
      * ``refExtendedness`` equals 1,
      * ``i_blendedness`` is below 0.01,
      * ``psfFlux / psfFluxErr`` is above 10 in each of the g, i, r and z bands.

    ``t`` is indexed by column name; the individual cuts are combined by
    element-wise multiplication.
    """
    is_extended = t['refExtendedness'] == 1
    is_unblended = t['i_blendedness'] < 0.01

    # Per-band flux over flux-error ratios.
    snr_g = t['g_psfFlux'] / t['g_psfFluxErr']
    snr_i = t['i_psfFlux'] / t['i_psfFluxErr']
    snr_r = t['r_psfFlux'] / t['r_psfFluxErr']
    snr_z = t['z_psfFlux'] / t['z_psfFluxErr']

    return (
        is_extended
        * is_unblended
        * (snr_g > 10)
        * (snr_i > 10)
        * (snr_r > 10)
        * (snr_z > 10)
    )

In [None]:
def get_features(t):
    """Assemble the regression feature matrix from a catalog table.

    ``t`` is indexed by column name.  The returned array has one row per
    catalog entry and ten columns, in this order:

      0     ``sersic_index``
      1     ``g_ixx + g_iyy``
      2     ``z_ixx + z_iyy``
      3     ``sersic_reff_x**2 + sersic_reff_y**2``
      4-9   ``{band}_psfMag`` for band in u, g, r, i, z, y

    NaNs are replaced by 0 in columns 0-3 and by 30 in the magnitude
    columns.  The input table is left unmodified.
    """
    sersic_x = t['sersic_reff_x']
    sersic_y = t['sersic_reff_y']
    sersic_trace = np.nan_to_num(sersic_x*sersic_x + sersic_y*sersic_y, nan=0)
    # The fill value must be passed as nan=...: the second positional argument
    # of np.nan_to_num is `copy`, and copy=0 would overwrite the NaNs in the
    # caller's 'sersic_index' column in place.
    sersic_index = np.nan_to_num(t['sersic_index'], nan=0)

    g_trace = np.nan_to_num(t['g_ixx'] + t['g_iyy'], nan=0)
    z_trace = np.nan_to_num(t['z_ixx'] + t['z_iyy'], nan=0)

    feature_list = [sersic_index, g_trace, z_trace, sersic_trace]
    feature_list += [np.nan_to_num(t[f'{band}_psfMag'], nan=30) for band in 'ugrizy']

    # Stack the per-feature 1-D arrays as rows, then transpose to (n_rows, n_features).
    return np.vstack(feature_list).T


In [None]:
def run_regression(
    regerssor,
    train_features: np.ndarray,
    train_targets: np.ndarray,
    test_features: np.ndarray,
) -> np.ndarray:
    """Fit a regressor on standardized, PCA-transformed features and predict.

    Parameters
    ----------
    regerssor
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.  It is fitted in
        place by this call.  (The misspelled name is kept so existing keyword
        callers keep working.)
    train_features
        Feature matrix used to fit the scaler, the PCA and the regressor.
    train_targets
        Target values matching the rows of ``train_features``.
    test_features
        Feature matrix to predict for; it goes through the scaler and PCA
        fitted on the training features.

    Returns
    -------
    np.ndarray
        ``regerssor.predict`` evaluated on the transformed test features.
    """
    # Standardize with statistics from the training set only, then clip the
    # scaled values to [-5, 5].
    scaler = preprocessing.StandardScaler().fit(train_features)
    scaled_train = scaler.transform(train_features).clip(-5, 5)
    # Keep as many PCA components as there are input features.
    pca = PCA(n_components=train_features.shape[-1], whiten=False).fit(scaled_train)
    pca_train = pca.transform(scaled_train)
    # The test set goes through the same fitted scaler and PCA.
    scaled_test = scaler.transform(test_features).clip(-5, 5)
    pca_test = pca.transform(scaled_test)
    regerssor.fit(pca_train, train_targets)
    return regerssor.predict(pca_test)

In [None]:
# Fixed seed so the estimators that use randomness give the same result on
# every Restart & Run All.
RANDOM_SEED = 42

# Candidate regressors, otherwise with library defaults.
hbr = HistGradientBoostingRegressor(random_state=RANDOM_SEED)
etr = ExtraTreesRegressor(random_state=RANDOM_SEED)
abr = AdaBoostRegressor(random_state=RANDOM_SEED)
gpr = GaussianProcessRegressor(random_state=RANDOM_SEED)
isr = IsotonicRegression()
hur = HuberRegressor()
lir = LinearRegression()
qur = QuantileRegressor()
nsr = NuSVR()
knr = KNeighborsRegressor()
rnr = RadiusNeighborsRegressor()

In [None]:
# Row-selection masks for both catalogs, built with the same cuts (make_mask).
mask_test, mask_train = make_mask(test), make_mask(train)

In [None]:
# Test sample: feature matrix and 'redshift' targets for the selected rows.
features_test = get_features(test)[mask_test]
targets_test = test['redshift'][mask_test]

# Training sample, selected the same way.
features_train = get_features(train)[mask_train]
targets_train = train['redshift'][mask_train]

In [None]:
def run_it(reg):
    """Fit ``reg`` on the magnitude features only and plot the test predictions.

    Uses feature columns 4 onward, which are the six ``{band}_psfMag``
    columns built by ``get_features``.  Draws the true-vs-predicted plot and
    the biweight statistics versus redshift; returns nothing.
    """
    preds = run_regression(reg, features_train[:,4:], targets_train, features_test[:,4:])
    # The fill value must be passed as nan=...: np.nan_to_num's second
    # positional argument is `copy`, so a positional -0.4 would leave NaN
    # predictions at the default fill of 0.0.
    _ = plotting_functions.plot_true_predict_fancy(targets_test, np.nan_to_num(preds, nan=-0.4))
    _ = plotting_functions.plot_biweight_stats_v_redshift(targets_test, preds)

In [None]:
def run_it_size(reg):
    """Fit ``reg`` on magnitudes plus the Sersic size term and plot the test predictions.

    Uses feature columns 3 onward from ``get_features``: the
    ``sersic_reff_x**2 + sersic_reff_y**2`` column followed by the six
    ``{band}_psfMag`` columns.  Draws the true-vs-predicted plot and the
    biweight statistics versus redshift; returns nothing.
    """
    preds = run_regression(reg, features_train[:,3:], targets_train, features_test[:,3:])
    # The fill value must be passed as nan=...: np.nan_to_num's second
    # positional argument is `copy`, so a positional -0.4 would leave NaN
    # predictions at the default fill of 0.0.
    _ = plotting_functions.plot_true_predict_fancy(targets_test, np.nan_to_num(preds, nan=-0.4))
    _ = plotting_functions.plot_biweight_stats_v_redshift(targets_test, preds)

In [None]:
def run_it_sizes(reg):
    """Fit ``reg`` on every feature except ``sersic_index`` and plot the test predictions.

    Uses feature columns 1 onward from ``get_features``: the g-band and
    z-band ``ixx + iyy`` sums, the ``sersic_reff_x**2 + sersic_reff_y**2``
    column, and the six ``{band}_psfMag`` columns.  Draws the
    true-vs-predicted plot and the biweight statistics versus redshift;
    returns nothing.
    """
    preds = run_regression(reg, features_train[:,1:], targets_train, features_test[:,1:])
    # The fill value must be passed as nan=...: np.nan_to_num's second
    # positional argument is `copy`, so a positional -0.4 would leave NaN
    # predictions at the default fill of 0.0.
    _ = plotting_functions.plot_true_predict_fancy(targets_test, np.nan_to_num(preds, nan=-0.4))
    _ = plotting_functions.plot_biweight_stats_v_redshift(targets_test, preds)

In [None]:
def run_it_all(reg):
    """Fit ``reg`` on the full feature matrix and plot the test predictions.

    Uses every column built by ``get_features``.  Draws the
    true-vs-predicted plot and the biweight statistics versus redshift;
    returns nothing.
    """
    preds = run_regression(reg, features_train, targets_train, features_test)
    # The fill value must be passed as nan=...: np.nan_to_num's second
    # positional argument is `copy`, so a positional -0.4 would leave NaN
    # predictions at the default fill of 0.0.
    _ = plotting_functions.plot_true_predict_fancy(targets_test, np.nan_to_num(preds, nan=-0.4))
    _ = plotting_functions.plot_biweight_stats_v_redshift(targets_test, preds)

In [None]:
# ExtraTrees on the six psfMag columns only.
run_it(etr)

In [None]:
# ExtraTrees on the psfMag columns plus the sersic_reff squared-size column.
run_it_size(etr)

In [None]:
# ExtraTrees on every feature column except sersic_index.
run_it_sizes(etr)

In [None]:
# ExtraTrees on the full feature matrix.
run_it_all(etr)