In [1]:
# Move the working directory one level up so the relative paths used below
# ('configs/...', the 'src' package) resolve.
# NOTE(review): this is relative to the current directory, so re-running the
# cell without restarting the kernel moves up one more level.
%cd ../

/home/users/dmoreno2016/VisionTransformers


In [2]:
import pandas as pd
import yaml
import warnings
# Suppress every warning from this point on (all categories, all libraries).
warnings.filterwarnings("ignore")

# Project-local dataset loader; importable because the working directory was
# changed to the repository root in the first cell.
from src.data.processing.get_data import get_dataset

In [3]:
def load_yaml(path):
    """
    Loads a YAML file and returns its parsed content.

    Parameters:
    path (str): Path to the YAML file.

    Returns:
    The Python object produced by the YAML parser (a dict when the document
    is a mapping).
    """
    # Be explicit about the encoding so parsing does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as stream:
        # NOTE(review): FullLoader resolves more than plain scalars and
        # containers; prefer yaml.safe_load if the file may be untrusted.
        return yaml.load(stream, Loader=yaml.FullLoader)


def compute_cadence_per_band(df, snid_col, band_col, time_col, band_mapping=None):
    """
    Computes the mean, median, and standard deviation of the cadence per band
    from a dataset of light curves.

    The cadence is the difference between consecutive observation times of the
    same object in the same band. The first observation of every (object, band)
    group has no predecessor and therefore contributes no cadence value.

    Parameters:
    df (pd.DataFrame): A DataFrame containing object ID, band, and time columns.
        It is not modified.
    snid_col (str): Name of the column representing the object ID.
    band_col (str): Name of the column representing the photometric band.
    time_col (str): Name of the column representing the observation time (MJD).
    band_mapping (dict, optional): Mapping from band values to display labels,
        applied to the index of the result. Ignored when empty or None.

    Returns:
    pd.DataFrame: Indexed by band, with the columns 'Mean Cadence',
    'Median Cadence' and 'Std Dev Cadence'.
    """
    key_cols = [snid_col, band_col, time_col]

    # Only the key columns are needed; selecting them first avoids sorting and
    # copying unrelated columns (flux, errors, ...).
    ordered = df[key_cols].sort_values(by=key_cols)
    cadence = ordered.groupby([snid_col, band_col])[time_col].diff()

    # Drop only the rows that have no cadence. A blanket dropna() would also
    # discard observations whose NaN lives in an unrelated column, even though
    # their time difference is perfectly valid.
    ordered = ordered.assign(cadence=cadence).dropna(subset=['cadence'])

    cadence_stats = (
        ordered.groupby(band_col)['cadence']
        .agg(['mean', 'median', 'std'])
        .rename(columns={
            'mean': 'Mean Cadence',
            'median': 'Median Cadence',
            'std': 'Std Dev Cadence',
        })
    )

    if band_mapping:
        cadence_stats = cadence_stats.rename(index=band_mapping)

    return cadence_stats

import pandas as pd

def compute_cadence(df, snid_col, time_col):
    """
    Computes the mean, median, and standard deviation of cadence from a dataset of light curves.

    The cadence is the difference between consecutive observation times of the
    same object, regardless of band. The first observation of every object has
    no predecessor and therefore contributes no cadence value.

    Parameters:
    df (pd.DataFrame): A DataFrame containing object ID and time columns.
        It is not modified.
    snid_col (str): Name of the column representing the object ID.
    time_col (str): Name of the column representing the observation time (MJD).

    Returns:
    pd.Series: A series containing the mean, median, and standard deviation of the cadence.
    """
    key_cols = [snid_col, time_col]

    # Only the key columns are needed; selecting them first avoids sorting and
    # copying unrelated columns.
    ordered = df[key_cols].sort_values(by=key_cols)

    # Drop only the missing differences. A blanket dropna() on the full frame
    # would also discard observations whose NaN lives in an unrelated column.
    cadence = ordered.groupby(snid_col)[time_col].diff().dropna()

    return pd.Series({
        'Mean Cadence': cadence.mean(),
        'Median Cadence': cadence.median(),
        'Std Dev Cadence': cadence.std(),
    })

In [4]:
# Load the per-dataset configuration, keyed by dataset name (directories,
# column names, band indices and class mappings, as shown in the output).
datasets_config = load_yaml(path='configs/datasets_config.yaml')
datasets_config

{'elasticc_1': {'data_dir': 'data/lightcurves/elasticc_1/raw',
  'path_partition': 'data/lightcurves/elasticc_1/ATAT_partition/partitions_v1.parquet',
  'dict_columns': {'snid': 'SNID',
   'mjd': 'MJD',
   'flux': 'FLUXCAL',
   'flux_err': 'FLUXCALERR',
   'band': 'BAND',
   'label': 'label'},
  'all_bands': {'u': 0, 'g': 1, 'r': 2, 'i': 3, 'z': 4, 'Y': 5},
  'dict_mapping_real_classes': {'AGN': 'AGN',
   'CART': 'CART',
   'Cepheid': 'Cepheid',
   'd-Sct': 'Delta Scuti',
   'dwarf-nova': 'Dwarf Novae',
   'EB': 'EB',
   'ILOT': 'ILOT',
   'KN_B19': 'KN',
   'KN_K17': 'KN',
   'Mdwarf-flare': 'M-dwarf Flare',
   'PISN': 'PISN',
   'RRL': 'RR Lyrae',
   'SLSN-I+host': 'SLSN',
   'SLSN-I_no_host': 'SLSN',
   'SNIa-91bg': '91bg',
   'SNIa-SALT2': 'Ia',
   'SNIax': 'Iax',
   'SNIb+HostXT_V19': 'Ib/c',
   'SNIb-Templates': 'Ib/c',
   'SNIc+HostXT_V19': 'Ib/c',
   'SNIc-Templates': 'Ib/c',
   'SNIcBL+HostXT_V19': 'Ib/c',
   'SNII+HostXT_V19': 'II',
   'SNII-NMF': 'II',
   'SNII-Templates': '

In [5]:
# Load the MACHO multiband light curves using their entry in the config.
# The displayed result is a single DataFrame with one row per observation
# (mjd, mag, err, lcid, band).
name_dataset = 'macho_multiband'
macho_multiband_config = datasets_config[name_dataset]
macho_dataset = get_dataset(macho_multiband_config, name_dataset, debug=False)
macho_dataset

Unnamed: 0,mjd,mag,err,lcid,band
0,48826.73047,-4.817,0.090,11.9111.813,1
1,48829.70312,-5.107,0.063,11.9111.813,1
2,48836.66406,-4.820,0.097,11.9111.813,1
3,48841.67969,-4.935,0.067,11.9111.813,1
4,48855.76172,-4.667,0.087,11.9111.813,1
...,...,...,...,...,...
29217169,51513.62109,-4.920,0.058,79.4658.5443,0
29217170,51525.61328,-5.157,0.080,79.4658.5443,0
29217171,51526.54297,-4.807,0.051,79.4658.5443,0
29217172,51530.50000,-5.162,0.082,79.4658.5443,0


In [6]:
# Invert the config's band mapping (name -> index) into index -> name so the
# result can be labelled with band names.
dict_macho_bands_mapping = {v: k for k, v in macho_multiband_config['all_bands'].items()}

# Per-band cadence statistics for the MACHO multiband dataset
cadence_band_macho_results = compute_cadence_per_band(
    macho_dataset, 
    snid_col='lcid', 
    band_col='band', 
    time_col='mjd', 
    band_mapping=dict_macho_bands_mapping
    )

# Display results
print("Cadence statistics per band:")
print(cadence_band_macho_results)

Cadence statistics per band:
      Mean Cadence  Median Cadence  Std Dev Cadence
band                                               
B         3.728875         1.96485         7.037851
R         4.162752         1.96485        11.670963


In [7]:
# Overall MACHO cadence: time differences between consecutive observations of
# the same light curve, with all bands pooled together.
cadence_macho_results = compute_cadence(
    macho_dataset, 
    snid_col='lcid', 
    time_col='mjd',
    )

cadence_macho_results

Mean Cadence       1.978321
Median Cadence     0.753910
Std Dev Cadence    5.030989
dtype: float64

In [8]:
# Load the ELAsTiCC light curves using their entry in the config.
# Unlike the MACHO case, the displayed result is a list of DataFrames.
name_dataset = 'elasticc_1'
elasticc_1_config = datasets_config[name_dataset]
elasticc_1_dataset = get_dataset(elasticc_1_config, name_dataset, debug=False)
elasticc_1_dataset

[              SNID         MJD    FLUXCAL  FLUXCALERR  BAND
 1         55162615  60275.0307  37.621403   12.546493     3
 2         21161797  60275.0822   0.634472    6.241648     3
 3         21161797  60275.0991  11.409507    8.415623     4
 4         69692785  60275.1120  28.230291   18.702690     4
 5         69692785  60275.1440  -8.457952   15.360238     5
 ...            ...         ...        ...         ...   ...
 12288024  49522972  61379.2866   0.330099    8.985371     0
 12288034  16004377  61379.2918  24.625963    3.126866     1
 12288036  41705039  61379.2923  85.104683    3.559689     1
 12288040  16767804  61379.2936  29.126663    3.341172     1
 12288062  49522972  61379.3032  19.520683    4.444035     1
 
 [1762000 rows x 5 columns],
              SNID         MJD      FLUXCAL  FLUXCALERR  BAND
 10       68572360  60275.2703   475.357086   12.076059     3
 14       68572360  60275.2928   559.145508   12.904810     4
 16       68572360  60275.3183   623.651184   37.15

In [9]:
# Merge the list of partial DataFrames into a single one. The name is rebound
# from a list to a DataFrame, so guard the call: without the guard, re-running
# this cell would pass a DataFrame to pd.concat and raise a TypeError.
if not isinstance(elasticc_1_dataset, pd.DataFrame):
    elasticc_1_dataset = pd.concat(elasticc_1_dataset)
elasticc_1_dataset

Unnamed: 0,SNID,MJD,FLUXCAL,FLUXCALERR,BAND
1,55162615,60275.0307,37.621403,12.546493,3
2,21161797,60275.0822,0.634472,6.241648,3
3,21161797,60275.0991,11.409507,8.415623,4
4,69692785,60275.1120,28.230291,18.702690,4
5,69692785,60275.1440,-8.457952,15.360238,5
...,...,...,...,...,...
13678579,6934851,61379.2923,24.895124,3.040788,1
13678585,2029,61379.2941,18.409544,2.873991,1
13678594,48734238,61379.2975,592.740173,5.694680,1
13678598,52116304,61379.2986,7.985216,1.449014,1


In [10]:
# Invert the config's band mapping (name -> index) into index -> name so the
# result can be labelled with band names.
dict_elasticc_1_bands_mapping = {v: k for k, v in elasticc_1_config['all_bands'].items()}

# Per-band cadence statistics for the concatenated ELAsTiCC dataset
cadence_band_elasticc_1_results = compute_cadence_per_band(
    elasticc_1_dataset, 
    snid_col='SNID', 
    band_col='BAND', 
    time_col='MJD', 
    band_mapping=dict_elasticc_1_bands_mapping
    )

# Display results
print("Cadence statistics per band:")
print(cadence_band_elasticc_1_results)

Cadence statistics per band:
      Mean Cadence  Median Cadence  Std Dev Cadence
BAND                                               
u        41.619209          6.0163        89.746704
g        31.610469         10.1082        65.418983
r        16.186151          5.9772        36.864522
i        15.835941          7.9760        35.429004
z        18.704878          7.1258        39.432418
Y        25.821254          8.1241        50.046371


In [11]:
# Overall ELAsTiCC cadence: time differences between consecutive observations
# of the same object, with all bands pooled together.
cadence_elasticc_1_results = compute_cadence(
    elasticc_1_dataset, 
    snid_col='SNID', 
    time_col='MJD',
    )

cadence_elasticc_1_results

Mean Cadence        3.857898
Median Cadence      0.933800
Std Dev Cadence    16.180767
dtype: float64