In [1]:
from typing import Tuple, Any, Dict, Type, Union, List, Optional
import numpy as np
from src.vae.vae_models import *
from src.vae.datasets import Astro_lightcurves
from src.utils import *
import src.sampler.fit_regressor as reg
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

In [2]:
# --- Configuration -----------------------------------------------------------
# Project paths and the prior definitions they point to.
with open('src/paths.yaml', 'r') as file:
    YAML_FILE: Dict[str, Any] = yaml.safe_load(file)

PATHS: Dict[str, str] = YAML_FILE['paths']
PATH_PRIOS: str = PATHS['PATH_PRIOS']
PATH_MODELS: str = PATHS['PATH_MODELS']
mean_prior_dict: Dict[str, Any] = load_yaml_priors(PATH_PRIOS)

# Regressor configuration (VAE run ID, GPU flag, regressor hyper-parameters).
with open('src/regressor.yaml', 'r') as file:
    config_file: Dict[str, Any] = yaml.safe_load(file)

# The cells and functions below read this configuration under the name
# `reg_conf_file`, which was never assigned in this notebook; bind both names to
# the same dict so the notebook survives "Restart Kernel -> Run All".
reg_conf_file: Dict[str, Any] = config_file

# ID of the wandb run holding the trained VAE.
# Earlier IDs noted here: '1pjeearx' (5 physical parameters) and '20twxmei'
# (trained using TPM using GAIA3).
vae_model: str = config_file['model_parameters']['ID']
print('Using vae model: '+ vae_model)

Using vae model: 1ygxtz38


In [3]:
# Exploratory pass: encode the whole dataset once and report how many objects
# each class has, before and after collapsing rows that share an OGLE_id.
phys2 = ['abs_Gmag', 'teff_val', 'Period', 'radius_val', '[Fe/H]_J95', 'logg']
print('training regressor using vae latent space')
gpu = reg_conf_file['model_parameters']['gpu']
print('vae model: ', vae_model)

# Load the trained VAE and its dataset, then run the encoder over all samples
# in dataset order (no shuffling, no held-out split).
vae, config, _ = reg.setup_environment(vae_model, gpu)
dataset = reg.prepare_dataset(config)
train_dataloader, _ = dataset.get_dataloader(batch_size=100, test_split=0.0, shuffle=False)
mu, _ = evaluate_encoder(vae, train_dataloader, config, force=False)

# Metadata rows with all requested physical parameters present.
meta_ = dataset.meta.dropna(subset=phys2)
# All but the last column of the encoder output, as float64.
# NOTE(review): the last column is presumably a label/identifier - confirm.
mu_ = mu.iloc[:, :-1].values.astype(np.float64)

print('meta: ', meta_.shape)
print('mu: ', mu_.shape)

print('objects by class')
print(meta_['Type'].value_counts())

print('objects by class deleting duplicated objects')
print(meta_.reset_index().drop_duplicates(subset='OGLE_id')['Type'].value_counts())

training regressor using vae latent space
vae model:  1ygxtz38
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae 1ygxtz38
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/VAE_model_fragrant-eon-178.pt
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/config.yaml
Loading from... 
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/VAE_model_fragrant-eon-178.pt
Is model in cuda?  True
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_IMPUTED_6PP.npy.gz
phy_params:  PTARMG
lcs shape:  (62858, 600, 3)
labels shape:  (62858,)
meta shape:  (62858, 8)
meta_p shape:  (62858, 6)
labels one hot shape:  (62858, 8)
labels int shape:  (62858,)
Evaluating Encoder...
Elapsed time  : 36.00 s
########################################
mu_df.shape:  (62858, 5)
std_df.shape:  (62858, 5)

In [12]:
def train_model_polynomial_trick(regressor: Type, config_dic: Dict[str, Any], name: str, p: np.ndarray, z: np.ndarray,
                                 test_size=0.0, degree=3) -> Any:
    """Fit ``regressor`` on a polynomial expansion of ``p`` to predict ``z``.

    Args:
        regressor: Estimator class; instantiated as ``regressor(**config_dic[name])``.
        config_dic: Mapping from regressor name to its constructor keyword arguments.
        name: Key selecting the entry of ``config_dic`` to use.
        p: Input features, expanded with ``PolynomialFeatures(degree=degree)``.
        z: Regression targets.
        test_size: When greater than 0, hold out this fraction (``random_state=42``)
            and report metrics on it; otherwise fit and report on all of the data.
        degree: Degree of the polynomial expansion.

    Returns:
        The estimator instance. If a ``MemoryError`` is raised while fitting or
        predicting, ``'Fail'`` is printed and the estimator is still returned,
        possibly without having been fitted.
    """
    # Expand the inputs first, then build the estimator from its config entry.
    expanded = PolynomialFeatures(degree=degree).fit_transform(p)
    estimator = regressor(**config_dic[name])

    try:
        print('p: ', p.shape)
        print('z: ', z.shape)
        print('Using {} to test'.format(test_size))

        if test_size > 0:
            # Hold-out evaluation: fit on one part, score on the other.
            x_fit, x_eval, y_fit, y_eval = train_test_split(
                expanded, z, test_size=test_size, random_state=42
            )
            estimator.fit(x_fit, y_fit)
            reg.print_metrics_regression(y_eval, estimator.predict(x_eval))
        else:
            # No hold-out: metrics are computed on the data used for fitting.
            print('Fitting model')
            estimator.fit(expanded, z)

            print('Predicting latent space')
            in_sample = estimator.predict(expanded)

            print('Getting metrics')
            reg.print_metrics_regression(z, in_sample)
    except MemoryError:
        print('Fail')

    return estimator

def apply_regression(vae_model, samples: Union[np.ndarray, List] = None,  from_vae: bool = False, 
        train_rf: bool = True, 
        phys2 = ['abs_Gmag', 'teff_val', 'Period', 'radius_val', '[Fe/H]_J95', 'logg'], 
        polynomial_trick = False) -> Tuple[Any, Any]:
    """Encode the dataset with the VAE and train/evaluate the latent-space regressor.

    Metadata rows lacking any of the ``phys2`` columns are discarded, then only the
    first row of every ``OGLE_id`` is kept. The same row selection is applied to the
    encoder output so metadata and latent vectors stay paired.

    Args:
        vae_model: Run ID handed to ``reg.setup_environment``.
        samples: Forwarded to ``process_regressors``.
        from_vae: Forwarded to ``process_regressors``.
        train_rf: Forwarded to ``process_regressors`` (train a new model when True,
            otherwise load the saved one).
        phys2: Metadata columns used as regressor inputs. The default list is
            never mutated here.
        polynomial_trick: Forwarded to ``process_regressors``.

    Returns:
        Whatever ``process_regressors`` returns, unchanged.
    """
    print('training regressor using vae latent space')
    gpu = reg_conf_file['model_parameters']['gpu']
    print('vae model: ', vae_model)
    vae, config, _ = reg.setup_environment(vae_model, gpu)
    dataset = reg.prepare_dataset(config)
    train_dataloader, _ = dataset.get_dataloader(batch_size=100, test_split=0.0, shuffle=False)
    mu, _ = evaluate_encoder(vae, train_dataloader, config, force=False)

    # Row-wise equivalent of ``dropna(subset=phys2)``, kept as a mask so the very
    # same rows can be selected from the encoder output. Filtering only the
    # metadata would shift every later position relative to ``mu``.
    # Assumes row i of ``mu`` belongs to row i of ``dataset.meta`` (the loader
    # above is unshuffled and unsplit) - TODO confirm.
    complete = dataset.meta[phys2].notna().all(axis=1).to_numpy()
    meta_ = dataset.meta[complete]
    # Drop the last column of the encoder output.
    # NOTE(review): presumably a label/identifier column - confirm.
    mu_ = mu.iloc[:, :-1].values.astype(np.float64)[complete]
    print('meta: ', meta_.shape)
    print('mu: ', mu_.shape)

    # Keep the first occurrence of each OGLE_id; ``reset_index`` exposes OGLE_id
    # as a column and makes the mask purely positional.
    first_seen = ~meta_.reset_index().duplicated(subset=['OGLE_id']).to_numpy()
    meta_ = meta_.iloc[first_seen]
    mu_ = mu_[first_seen]
    print('after dropping duplicated objects')
    print('meta: ', meta_.shape)
    print('mu: ', mu_.shape)

    #TODO: check if assess by class or using all the samples
    # Forward the caller's options instead of hard-coded values, and return the
    # regressor result as-is (unpacking into ``_, _`` duplicated the second item).
    return process_regressors(reg_conf_file, phys2=phys2, meta_=meta_, mu_=mu_,
                              samples=samples, from_vae=from_vae,
                              train_rf=train_rf, polynomial_trick=polynomial_trick)

def process_regressors(reg_conf_file: Dict[str, Any],
                       meta_: Optional[Any] = None,
                       phys2: Optional[List[str]] = None,
                       mu_: Optional[Any] = None,
                       samples: Optional[Any] = None,
                       from_vae: bool = True,
                       train_rf: bool = False, 
                       grid_search: bool = False, 
                       polynomial_trick: bool = False) -> Optional[Tuple[Any, Any]]:
    """Train a regressor from physical parameters to latent codes, or evaluate a saved one.

    Args:
        reg_conf_file: Configuration dict; ``['regressors']`` holds per-regressor
            constructor arguments and ``['model_parameters']['ID']`` is used to
            build the model file name.
        meta_: Table indexed as ``meta_[phys2]`` (callers in this notebook pass a
            pandas DataFrame).
        phys2: Column names used as regressor inputs.
        mu_: Regression targets; must support ``.copy()``.
        samples: Currently unused.
        from_vae: Must be True; no other data source is implemented.
        train_rf: Train and save a new model when True, otherwise load the saved
            model and report metrics on the given data.
        grid_search: Not implemented; only consulted when ``train_rf`` is True.
        polynomial_trick: Expand the inputs with degree-3 polynomial features.

    Returns:
        ``(z, None)`` after training (no predictions are computed on that path), or
        ``(z_hat, z)`` after evaluating a saved model. Note that the position of
        ``z`` differs between the two paths.

    Raises:
        NotImplementedError: ``from_vae`` is False, or grid search was requested.
            (A subclass of ``RuntimeError``, which the former bare ``raise``
            produced.)
        ValueError: ``meta_``, ``phys2`` or ``mu_`` is missing.
    """
    # Validate up front so unsupported options fail with a meaningful error
    # before any work is done.
    if not from_vae:
        raise NotImplementedError("Only from_vae=True is supported.")
    if (meta_ is None) or (phys2 is None) or (mu_ is None):
        raise ValueError("meta_, phys2, and mu_ must be provided if from_vae is True.")
    if train_rf and grid_search:
        raise NotImplementedError("Grid search is not implemented.")

    regressors = {'RFR': RandomForestRegressor}
    config_dict = dict(reg_conf_file['regressors'])
    p = meta_[phys2].copy()
    z = mu_.copy()

    # Each branch returns, so only the first entry of ``regressors`` is processed.
    for name, regressor in regressors.items():
        # The file name does not depend on ``polynomial_trick``: both variants
        # share one path and the most recent training run wins.
        filename = 'models/' + reg_conf_file['model_parameters']['ID']+'_'+name + 'TEST_TO_ASSESS_CLASSES.pkl'
        if train_rf:
            print(f"Training new model {name}")
            print('Grid search was not applied')
            if polynomial_trick:
                model = train_model_polynomial_trick(regressor, config_dict, name, p, z, test_size=0.0)
            else:
                model = reg.train_model(regressor, config_dict, name, p, z, test_size=0.0)

            reg.save_model(model, filename=filename)
            # Read the file back (presumably a check that the saved model can be
            # loaded); the handle is closed deterministically.
            with open(filename, 'rb') as model_file:
                model = pickle.load(model_file)
            # No predictions are made here, so the second slot is an explicit
            # None rather than whatever a global ``_`` happened to hold.
            return z, None

        print(f"Loading existing model from {filename}")
        # SECURITY: pickle executes arbitrary code on load; only open model files
        # written by this project.
        with open(filename, 'rb') as model_file:
            model = pickle.load(model_file)
        print('Getting predictions')
        features = p
        if polynomial_trick:
            # Must match the expansion used at training time (default degree 3).
            features = PolynomialFeatures(degree=3).fit_transform(p)

        z_hat = model.predict(features)

        reg.print_metrics_regression(z, z_hat)
        return z_hat, z

    return None

def apply_regression_by_class(vae_model,  phys2 = ['abs_Gmag', 'teff_val', 'Period', 'radius_val', '[Fe/H]_J95', 'logg'], polynomial_trick =False) -> Tuple[Any, Any]:
    """Evaluate the saved latent-space regressor separately for each ``Type``.

    Metadata rows lacking any of the ``phys2`` columns are discarded and only the
    first row of every ``OGLE_id`` is used. For each distinct ``Type`` the matching
    rows are passed to ``process_regressors`` with its default ``train_rf``, i.e.
    the saved model is loaded rather than retrained.

    Args:
        vae_model: Run ID handed to ``reg.setup_environment``.
        phys2: Metadata columns used as regressor inputs. The default list is
            never mutated here.
        polynomial_trick: Forwarded to ``process_regressors``.

    Returns:
        The ``(z_hat, z)`` pair of the last class processed, or ``(None, None)``
        when there is no class to process. Results for earlier classes are only
        printed.
    """
    print('regressor using vae latent space')
    gpu = reg_conf_file['model_parameters']['gpu']
    print('vae model: ', vae_model)
    vae, config, _ = reg.setup_environment(vae_model, gpu)
    dataset = reg.prepare_dataset(config)
    train_dataloader, _ = dataset.get_dataloader(batch_size=100, test_split=0.0, shuffle=False)
    mu, _ = evaluate_encoder(vae, train_dataloader, config, force=False)

    # Row-wise equivalent of ``dropna(subset=phys2)``, kept as a mask so the same
    # rows are selected from the encoder output and the two stay paired.
    # Assumes row i of ``mu`` belongs to row i of ``dataset.meta`` (the loader
    # above is unshuffled and unsplit) - TODO confirm.
    complete = dataset.meta[phys2].notna().all(axis=1).to_numpy()
    meta_ = dataset.meta[complete]
    # Drop the last column of the encoder output.
    # NOTE(review): presumably a label/identifier column - confirm.
    mu_ = mu.iloc[:, :-1].values.astype(np.float64)[complete]
    print('meta: ', meta_.shape)
    print('mu: ', mu_.shape)

    # Work with positional masks: index labels are not valid arguments for
    # ``.iloc`` or NumPy indexing. ``reset_index`` also exposes OGLE_id as a
    # column when it is stored as the index.
    meta_flat = meta_.reset_index()
    is_first = ~meta_flat.duplicated(subset=['OGLE_id']).to_numpy()
    type_column = meta_flat['Type']

    z_hat, z = None, None
    for star_type in meta_.Type.unique():
        rows = is_first & (type_column == star_type).to_numpy()
        meta_train = meta_.iloc[rows]
        mu_train = mu_[rows]
        print('#'*20)
        print('Type: ', star_type)
        print('after dropping duplicated objects')
        print('meta class: ', meta_train.shape)
        print('mu class: ', mu_train.shape)
        #TODO: check if assess by class or using all the samples
        z_hat, z = process_regressors(reg_conf_file, phys2=phys2, meta_= meta_train, mu_=mu_train, from_vae=True, polynomial_trick=polynomial_trick)
    return z_hat, z

In [5]:
#!pip install tqdm

# Train model

In [6]:
# Train the regressor on the raw physical parameters (no polynomial expansion)
# and save it to disk.
z, z_hat= apply_regression(vae_model, from_vae = True, train_rf = True, 
                           phys2 = ['abs_Gmag', 'teff_val', 'Period', 'radius_val', '[Fe/H]_J95', 'logg'], 
                          polynomial_trick=False)

training regressor using vae latent space
vae model:  1ygxtz38
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae 1ygxtz38
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/VAE_model_fragrant-eon-178.pt
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/config.yaml
Loading from... 
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/VAE_model_fragrant-eon-178.pt
Is model in cuda?  True
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_IMPUTED_6PP.npy.gz
phy_params:  PTARMG
lcs shape:  (62858, 600, 3)
labels shape:  (62858,)
meta shape:  (62858, 8)
meta_p shape:  (62858, 6)
labels one hot shape:  (62858, 8)
labels int shape:  (62858,)
Evaluating Encoder...
Elapsed time  : 36.00 s
########################################
mu_df.shape:  (62858, 5)
std_df.shape:  (62858, 5)

In [7]:
# Load the saved regressor and print its metrics separately for each `Type`;
# only the last class's (z_hat, z) pair is kept in these variables.
z_hat, z = apply_regression_by_class(vae_model, phys2 = ['abs_Gmag', 'teff_val', 'Period', 'radius_val', '[Fe/H]_J95', 'logg'])

regressor using vae latent space
vae model:  1ygxtz38
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae 1ygxtz38
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/VAE_model_fragrant-eon-178.pt
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/config.yaml
Loading from... 
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/VAE_model_fragrant-eon-178.pt
Is model in cuda?  True
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_IMPUTED_6PP.npy.gz
phy_params:  PTARMG
lcs shape:  (62858, 600, 3)
labels shape:  (62858,)
meta shape:  (62858, 8)
meta_p shape:  (62858, 6)
labels one hot shape:  (62858, 8)
labels int shape:  (62858,)
Evaluating Encoder...
Elapsed time  : 29.00 s
########################################
mu_df.shape:  (62858, 5)
std_df.shape:  (62858, 5)
meta:  (

In [8]:
# Retrain with polynomial features of the physical parameters. The model file
# name does not depend on `polynomial_trick`, so this overwrites the file
# written by the non-polynomial training call.
z, z_hat= apply_regression(vae_model, from_vae = True, train_rf = True, 
                           phys2 = ['abs_Gmag', 'teff_val', 'Period', 'radius_val', '[Fe/H]_J95', 'logg'], 
                          polynomial_trick=True)

training regressor using vae latent space
vae model:  1ygxtz38
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae 1ygxtz38
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/VAE_model_fragrant-eon-178.pt
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/config.yaml
Loading from... 
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/VAE_model_fragrant-eon-178.pt
Is model in cuda?  True
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_IMPUTED_6PP.npy.gz
phy_params:  PTARMG
lcs shape:  (62858, 600, 3)
labels shape:  (62858,)
meta shape:  (62858, 8)
meta_p shape:  (62858, 6)
labels one hot shape:  (62858, 8)
labels int shape:  (62858,)
Evaluating Encoder...
Elapsed time  : 25.00 s
########################################
mu_df.shape:  (62858, 5)
std_df.shape:  (62858, 5)

In [13]:
# Per-class metrics for the saved model, applying the polynomial expansion to
# the inputs before predicting (the saved model must have been trained with it).
z_hat, z = apply_regression_by_class(vae_model, phys2 = ['abs_Gmag', 'teff_val', 'Period', 'radius_val', '[Fe/H]_J95', 'logg'], polynomial_trick=True)

regressor using vae latent space
vae model:  1ygxtz38
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae 1ygxtz38
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/VAE_model_fragrant-eon-178.pt
/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/config.yaml
Loading from... 
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/wandb/run--1ygxtz38/VAE_model_fragrant-eon-178.pt
Is model in cuda?  True
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_IMPUTED_6PP.npy.gz
phy_params:  PTARMG
lcs shape:  (62858, 600, 3)
labels shape:  (62858,)
meta shape:  (62858, 8)
meta_p shape:  (62858, 6)
labels one hot shape:  (62858, 8)
labels int shape:  (62858,)
Evaluating Encoder...
Elapsed time  : 34.00 s
########################################
mu_df.shape:  (62858, 5)
std_df.shape:  (62858, 5)
meta:  (