In [1]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np 
import yaml 
from typing import Tuple, Any, Dict, Type, Union, List
import gzip 

new_directory = '/home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/'
os.chdir(new_directory)

from src.utils import ensure_n_elements
with open('src/configuration/paths.yaml', 'r') as file:
    YAML_FILE: Dict[str, Any] = yaml.safe_load(file)

PATHS: Dict[str, str] = YAML_FILE['paths']
PATH_DATA_FOLDER: str =  PATHS['PATH_DATA_FOLDER']
PATH_LIGHT_CURVES_OGLE = PATHS['PATH_LIGHT_CURVES_OGLE']


In [2]:
with open('src/configuration/regressor.yaml', 'r') as file:
    reg_conf_file: Dict[str, Any] = yaml.safe_load(file)

data_sufix: str =   reg_conf_file['model_parameters']['sufix_path']  

if 'LOG' in data_sufix:
    MIN_PERIOD_VALUE = np.log(0.1)
else: 
    MIN_PERIOD_VALUE = 0.1
    
    
with open('src/configuration/nn_config.yaml', 'r') as file:
    nn_config = yaml.safe_load(file)

In [3]:
def load_id_period_to_sample(classes: List[str] = [], period: List[float] = []) -> pd.DataFrame:
    """
    Load and sample data based on specified classes and periods.

    This function loads data from a specified path, filters and samples it according to the given classes and period criteria.
    It supports handling of log-transformed period values.

    Parameters:
    classes (List[str]): List of classes to filter the data. If empty, a random sample is returned.
    period (List[float]): List of period values to use for filtering, corresponding to each class.

    Returns:
    pd.DataFrame: A DataFrame containing the sampled data based on the provided criteria.
    """
    PATH_ZIP_LCs = (PATH_DATA_FOLDER + '/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_' + 
                   data_sufix + '.npy.gz')
    print('Loading from:\n', PATH_ZIP_LCs)
    with gzip.open(PATH_ZIP_LCs, 'rb') as f:
        np_data = np.load(f, allow_pickle=True)

    df = np_data.item()['meta'][['OGLE_id', 'Period', 'Type']]

    black_list = pd.read_csv('data/black_list.csv')

    df = df[~df.OGLE_id.isin(black_list.ID.to_list())]
    if len(classes) == 0: 
        raise('There is not a label for sampling')
    else:
        samples = []
        counter = 0
        for t in classes:
            sample = pd.DataFrame()
            if t == 'ELL':
                sample = df[df['Type'] == 'ECL'].sample(n=1)
            else:
                filtered_df = df[(df['Type'] == t)]
                closest_idx = (filtered_df['Period'] - period[counter]).abs().nsmallest(1).index
                sample = filtered_df.loc[closest_idx]
            samples.append(sample)
            counter += 1
        df = pd.concat(samples, axis=0).reset_index(drop=True)
    return df

In [4]:
def get_only_time_sequence(n=1, star_class=['RRLYR'], period=[1.0]):
    """
    Retrieve time sequences from light curves data for 'n' objects.
    Parameters:
        n (int): Number of objects to sample.
    Returns:
        list: A list of lists containing time sequences from the light curves of 'n' objects.
    """
    
    n = int(n)
    df_id_period = load_id_period_to_sample(star_class, period=period)
    
    df_id_period[['SURVEY', 'FIELD', 'CLASS', 'NUMBER']] = df_id_period['OGLE_id'].str.split('-', expand=True)
    time_sequences = []
    original_sequences = []

    star_counter = 0
    for star in tqdm(star_class, desc='Selecting light curves'):
        if period[star_counter] < MIN_PERIOD_VALUE: 
            period[star_counter] = MIN_PERIOD_VALUE
        
        closest_idx = (df_id_period['Period'] - period[star_counter]).abs().idxmin()

        new_label = df_id_period.loc[closest_idx]['OGLE_id']

        path_lc = (PATH_LIGHT_CURVES_OGLE + new_label.split('-')[1].lower() +
            '/' + new_label.split('-')[2].lower() + '/phot/I/' + new_label + '.dat')

        lcu = pd.read_table(path_lc, sep=" ", names=['time', 'magnitude', 'error'])
        lcu = lcu.reset_index()

        if 'level_0' in lcu.columns: 
            lcu = lcu.dropna(axis=1)
            lcu.columns = ['time', 'magnitude', 'error']

        lcu = lcu.dropna(axis=0) 
        period_i = df_id_period[df_id_period.OGLE_id==new_label].Period.values[0]
        times = lcu['time'].to_list()
        lc_adapted = ensure_n_elements(times)
        lc_adapted_to_real_sequence = ensure_n_elements(times, n=350)
        lc_phased = ((lc_adapted-np.min(lc_adapted))/period_i)%1
        sorted_lc_phased = np.sort(lc_phased)
        time_sequences.append(sorted_lc_phased)
        original_sequences.append(lc_adapted_to_real_sequence)
        star_counter = star_counter + 1
    return time_sequences, original_sequences


In [8]:
for period in range(0, 10, 1): 
    (time, mag) = get_only_time_sequence(n=1, star_class=['RRLYR'], period=[np.log(period/100)])
    print(np.min(time), np.max(time), np.min(mag), np.max(mag))

  


Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_6PP.npy.gz


Selecting light curves: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 143.23it/s]


0.0 0.9985138690390158 2414.88032 4955.82864
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_6PP.npy.gz


Selecting light curves: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 147.02it/s]


0.0 0.9986400721411322 2125.59624 4944.73513
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_6PP.npy.gz


Selecting light curves: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 149.72it/s]


0.0 0.9986400721411322 2136.5195 4953.82998
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_6PP.npy.gz


Selecting light curves: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 138.80it/s]


0.0 0.9986400721411322 2125.59624 4954.79964
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_6PP.npy.gz


Selecting light curves: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 143.68it/s]


0.0 0.9986400721411322 2129.67387 4950.85062
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_6PP.npy.gz


Selecting light curves: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 144.86it/s]


0.0 0.9989802358022644 2125.59624 4953.82998
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_6PP.npy.gz


Selecting light curves: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 151.39it/s]


0.0 0.9986400721411322 2125.59624 4950.85062
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_6PP.npy.gz


Selecting light curves: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 150.94it/s]


0.0 0.9986400721411322 2136.5195 4953.82998
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_6PP.npy.gz


Selecting light curves: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 149.15it/s]


0.0 0.9986400721411322 2125.59624 4954.79964
Loading from:
 /home/franciscoperez/Documents/GitHub/CNN-PELSVAE2/cnn-pels-vae/data/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_GAIA3_LOG_6PP.npy.gz


Selecting light curves: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 142.93it/s]

0.0 0.9986400721411322 2125.59624 4950.85062





In [6]:
'''
PATH_ZIP_LCs = (PATH_DATA_FOLDER + '/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_' + 
               data_sufix + '.npy.gz')
print('Loading from:\n', PATH_ZIP_LCs)
with gzip.open(PATH_ZIP_LCs, 'rb') as f:
    np_data = np.load(f, allow_pickle=True)

df_GAIA = np_data.item()['meta'][['OGLE_id', 'Period', 'Type']]
black_list = []
white_list = []
for idx in tqdm(range(df_GAIA.shape[0]), desc='Selecting light curves'):
    try:
        new_label = df_GAIA.loc[idx]['OGLE_id']
        path_lc = (PATH_LIGHT_CURVES_OGLE + new_label.split('-')[1].lower() +
            '/' + new_label.split('-')[2].lower() + '/phot/I/' + new_label + '.dat')
        lcu = pd.read_table(path_lc, sep=" ", names=['time', 'magnitude', 'error'])
        if (lcu.shape[0] > nn_config['data']['minimum_lenght_real_curves']) and (lcu['time'].is_monotonic_increasing):
            white_list.append(new_label)
        else: 
            black_list.append(new_label)
    except: 
        black_list.append(new_label)
pd.DataFrame(black_list, columns=['ID']).to_csv('data/black_list.csv')
len(black_list)
len(white_list)
'''

'\nPATH_ZIP_LCs = (PATH_DATA_FOLDER + \'/time_series/real/OGLE3_lcs_I_meta_snr5_augmented_folded_trim600_\' + \n               data_sufix + \'.npy.gz\')\nprint(\'Loading from:\n\', PATH_ZIP_LCs)\nwith gzip.open(PATH_ZIP_LCs, \'rb\') as f:\n    np_data = np.load(f, allow_pickle=True)\n\ndf_GAIA = np_data.item()[\'meta\'][[\'OGLE_id\', \'Period\', \'Type\']]\nblack_list = []\nwhite_list = []\nfor idx in tqdm(range(df_GAIA.shape[0]), desc=\'Selecting light curves\'):\n    try:\n        new_label = df_GAIA.loc[idx][\'OGLE_id\']\n        path_lc = (PATH_LIGHT_CURVES_OGLE + new_label.split(\'-\')[1].lower() +\n            \'/\' + new_label.split(\'-\')[2].lower() + \'/phot/I/\' + new_label + \'.dat\')\n        lcu = pd.read_table(path_lc, sep=" ", names=[\'time\', \'magnitude\', \'error\'])\n        if (lcu.shape[0] > nn_config[\'data\'][\'minimum_lenght_real_curves\']) and (lcu[\'time\'].is_monotonic_increasing):\n            white_list.append(new_label)\n        else: \n            black_l