# Variation Database

## Imports

In [11]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml

## Search the file structure for variations and locate config / log files

In [12]:
# Find config and log files

def find_config_files(root_dir):
    """Return the paths of all config files stored in ``configs`` folders.

    The tree below ``root_dir`` is walked recursively. A file is reported
    when its name ends with ``config.yaml`` and the folder that directly
    contains it is named ``configs``.

    Parameters
    ----------
    root_dir : str
        Directory at which the search starts.

    Returns
    -------
    list of str
        Paths (joined onto the walked folder) in ``os.walk`` order; empty
        when nothing matches.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(root_dir)
        # Only files sitting directly inside a "configs" folder count
        if os.path.basename(folder) == "configs"
        for name in names
        if name.endswith("config.yaml")
    ]

def _find_stage_logs(root_dir, stage_name, suffix='.out'):
    """Collect log files found in ``logs`` folders beneath ``stage_name`` folders.

    Every folder under ``root_dir`` whose name equals ``stage_name`` is
    searched recursively; files whose name ends with ``suffix`` and that sit
    directly inside a folder named ``logs`` are returned.

    Parameters
    ----------
    root_dir : str
        Directory at which the search starts.
    stage_name : str
        Name of the stage folder to look for (e.g. ``"correlations"``).
    suffix : str, optional
        File-name ending that identifies a log file.

    Returns
    -------
    list of str
        Matching paths, each listed once, in ``os.walk`` order.
    """
    log_files = []
    for root, dirs, _files in os.walk(root_dir):
        if os.path.basename(root) != stage_name:
            continue
        for subroot, _subdirs, subfiles in os.walk(root):
            if os.path.basename(subroot) == "logs":
                log_files.extend(
                    os.path.join(subroot, name)
                    for name in subfiles
                    if name.endswith(suffix)
                )
        # The inner walk already covered everything below this stage folder.
        # Pruning keeps the outer walk from descending into it again, which
        # would report the same files twice if a folder with the same stage
        # name is nested inside.
        dirs[:] = []
    return log_files


def find_correlation_logs(root_dir):
    """Return ``.out`` files in ``logs`` folders beneath ``correlations`` folders."""
    return _find_stage_logs(root_dir, "correlations")


def find_deltas_logs(root_dir):
    """Return ``.out`` files in ``logs`` folders beneath ``deltas`` folders."""
    return _find_stage_logs(root_dir, "deltas")


def find_fits_logs(root_dir):
    """Return ``.out`` files in ``logs`` folders beneath ``fits`` folders."""
    return _find_stage_logs(root_dir, "fits")

In [13]:
# Example usage
# Scan a single root directory for every config file and for the log files of
# each stage (correlations, deltas, fits), then list what was found.
root_directory = '/pscratch/sd/m/mherbold/tests/'  # Replace with the path to your directory
config_files = find_config_files(root_directory)
correlation_logs = find_correlation_logs(root_directory)
delta_log_files = find_deltas_logs(root_directory)
fits_logs_files = find_fits_logs(root_directory)

# Print or process the files
# Each of the first three groups is followed by print('\n'), which emits two
# line breaks (the '\n' itself plus print's own newline) as a visual separator.
for config_file in config_files:
    print(f"Config Found: {config_file}")
print('\n')
    
for correlation_log in correlation_logs:
    print(f"Correlation Log Found: {correlation_log}")
print('\n')
    
for delta_log in delta_log_files:
    print(f"Delta Log Found: {delta_log}")
print('\n')
    
# Last group: no separator is printed after it.
for fit_log in fits_logs_files:
    print(f"Fits Log Found: {fit_log}")

Config Found: /pscratch/sd/m/mherbold/tests/bookkeeper/broadband_polynomials/configs/bookkeeper_config.yaml
Config Found: /pscratch/sd/m/mherbold/tests/bookkeeper/version_test/configs/bookkeeper_config.yaml
Config Found: /pscratch/sd/m/mherbold/tests/bookkeeper/baseline/configs/bookkeeper_config.yaml
Config Found: /pscratch/sd/m/mherbold/tests/bookkeeper/no_arinyo/configs/bookkeeper_config.yaml
Config Found: /pscratch/sd/m/mherbold/tests/bookkeeper/lr_max_1200/configs/bookkeeper_config.yaml
Config Found: /pscratch/sd/m/mherbold/tests/bookkeeper/baseline_test/configs/bookkeeper_config.yaml


Correlation Log Found: /pscratch/sd/m/mherbold/tests/bookkeeper/baseline/correlations/logs/dmat_lyalya_lyalyb-29474755.out
Correlation Log Found: /pscratch/sd/m/mherbold/tests/bookkeeper/baseline/correlations/logs/xcf_lyalyb-29474771.out
Correlation Log Found: /pscratch/sd/m/mherbold/tests/bookkeeper/baseline/correlations/logs/xdmat_lyalya-29474763.out
Correlation Log Found: /pscratch/sd/m/mherbold/

## Create a table of info from .yaml files

For each variation, collect the requested values from its config file (e.g. catalog location, parameter values) together with the software versions reported in its log files.

In [14]:

###################################################################

# Extract module versions from log files

def extract_module_versions(file_path):
    """Parse ``<module> version: <version>`` lines out of a log file.

    For every line containing ``" version: "``, the last whitespace-separated
    token before the first marker is taken as the module name and the text
    after the last marker as the version. Lines that contain the marker but
    lack a name or a version are reported and skipped instead of producing a
    bogus entry.

    Parameters
    ----------
    file_path : str
        Path of the log file to read.

    Returns
    -------
    dict
        Maps module name to version string. If a module appears on several
        lines, the last occurrence wins.
    """
    marker = " version: "
    modules = {}
    with open(file_path, "r") as f:
        # Iterate lazily; log files can be large
        for line in f:
            if marker not in line:
                continue
            # Split BEFORE stripping: stripping first can remove the marker's
            # leading/trailing space, after which the split no longer
            # separates the name from the version.
            pieces = line.split(marker)
            name_tokens = pieces[0].split()
            version = pieces[-1].strip()
            if not name_tokens or not version:
                print(f"Could not parse module version line: {line.strip()}")
                continue
            modules[name_tokens[-1]] = version

    return modules

def gather_log_data(log_files):
    """Merge the module versions of several log files into one summary string.

    Each file is parsed with ``extract_module_versions``; the resulting
    dictionaries are merged in the order given, so a later file overrides an
    earlier one for the same module name.

    Parameters
    ----------
    log_files : iterable of str
        Paths of the log files to parse.

    Returns
    -------
    str
        ``"module: version"`` entries joined by ``", "``; an empty string
        when no versions were found.
    """
    merged = {}
    for path in log_files:
        merged.update(extract_module_versions(path))
    return ", ".join(f"{mod}: {ver}" for mod, ver in merged.items())

###################################################################

# Search for variation names from paths

def extract_variation_name(file_path):
    """Return the name of the folder two levels above ``file_path``.

    For ``<variation>/configs/<file>`` this yields ``<variation>``. An empty
    string is returned when the path has no such grandparent folder.
    """
    containing_dir = os.path.dirname(file_path)
    variation_dir = os.path.dirname(containing_dir)
    return os.path.basename(variation_dir)

###################################################################

# Find info from config.yaml files

def find_keys_in_yaml(file_path, keys):
    """Look up dotted key paths in a YAML file.

    Parameters
    ----------
    file_path : str
        Path of the YAML file, loaded with ``yaml.safe_load``.
    keys : iterable of str
        Key paths whose segments are separated by ``.``; each segment is
        looked up in the mapping reached by the previous one.

    Returns
    -------
    dict
        Maps each requested key path to the value found, or to ``None`` when
        a segment is missing or an intermediate value is not a mapping.
    """
    with open(file_path, 'r') as handle:
        config = yaml.safe_load(handle)

    def lookup(dotted_path):
        # Descend one segment at a time, giving up as soon as the current
        # node can no longer be indexed by key.
        node = config
        for segment in dotted_path.split('.'):
            if not isinstance(node, dict):
                return None
            node = node.get(segment, None)
        return node

    return {key: lookup(key) for key in keys}

###################################################################

# Find variation data

def find_variation_data(root_dir, keys=None):
    """Build, save and print a per-variation table of config values and module versions.

    One row is produced for every config file found under ``root_dir``. The
    row holds the variation name, the requested config values, and the module
    versions reported by that variation's own correlation, delta and fit
    logs. The table is written to ``config_data_with_modules.csv`` in the
    current working directory and printed.

    Parameters
    ----------
    root_dir : str
        Directory that contains the variation folders.
    keys : list of str, optional
        Dotted key paths to read from each config file. When omitted, the
        module-level ``keys_to_extract`` list is used.
    """
    if keys is None:
        # Fall back to the notebook-level list for existing callers
        keys = keys_to_extract

    data_rows = []

    # Find and process config.yaml files
    for file_path in find_config_files(root_dir):
        variation_name = extract_variation_name(file_path)
        # Config files live in <variation>/configs/, so the variation folder
        # is two levels up. Logs are searched there rather than under
        # root_dir; otherwise every row would receive the versions merged
        # from all variations.
        variation_dir = os.path.dirname(os.path.dirname(file_path))

        # Prepare row for the table
        row = {'Variation Name': variation_name}
        row.update(find_keys_in_yaml(file_path, keys))

        # Gather log file data for correlations, deltas, and fits
        row['correlations'] = gather_log_data(find_correlation_logs(variation_dir))
        row['deltas'] = gather_log_data(find_deltas_logs(variation_dir))
        row['fits'] = gather_log_data(find_fits_logs(variation_dir))

        # Add the row to the data rows
        data_rows.append(row)

    # Create DataFrame and save to CSV
    df = pd.DataFrame(data_rows)
    df.to_csv('config_data_with_modules.csv', index=False)
    print(df)

###################################################################


In [16]:
# Specify info from config files to extract

# Each entry is a dotted path into the config YAML ("section.key").
# Every entry ends with a comma: without it, uncommenting the next line would
# make Python join the two adjacent string literals into a single bogus key.
keys_to_extract = [
    'data.healpix data',
    # 'data.catalog',
    # 'delta extraction.dla',
    # 'delta extraction.bal',
    # 'delta extraction.mask file'
]

# Example usage
root_directory = '/pscratch/sd/m/mherbold/tests/'  # Replace with the path to your directory
find_variation_data(root_directory)


          Variation Name                                  data.healpix data  \
0  broadband_polynomials  /dvs_ro/cfs/cdirs/desi/spectro/redux/jura/healpix   
1           version_test  /dvs_ro/cfs/cdirs/desi/spectro/redux/jura/healpix   
2               baseline  /dvs_ro/cfs/cdirs/desi/spectro/redux/jura/healpix   
3              no_arinyo  /dvs_ro/cfs/cdirs/desi/spectro/redux/jura/healpix   
4            lr_max_1200  /dvs_ro/cfs/cdirs/desi/spectro/redux/jura/healpix   
5          baseline_test  /dvs_ro/cfs/cdirs/desi/spectro/redux/jura/healpix   

  correlations                                deltas  \
0               picca_bookkeeper: x.xx, picca: 9.3.0   
1               picca_bookkeeper: x.xx, picca: 9.3.0   
2               picca_bookkeeper: x.xx, picca: 9.3.0   
3               picca_bookkeeper: x.xx, picca: 9.3.0   
4               picca_bookkeeper: x.xx, picca: 9.3.0   
5               picca_bookkeeper: x.xx, picca: 9.3.0   

                                  fits  
0  picca_boo