# Variation Database

## Imports

In [55]:
import vega as vg
import picca
import picca_bookkeeper

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import yaml


## Search file structure for variations and config files

In [56]:
def find_config_files(root_dir):
    """Return paths of ``*config.yaml`` files located directly in ``configs`` folders.

    The tree below ``root_dir`` is traversed with ``os.walk``. Only
    directories whose own name is exactly ``configs`` are inspected, and only
    file names ending in ``config.yaml`` are kept.

    Args:
        root_dir: Directory at which the traversal starts.

    Returns:
        list of str: Joined directory/file paths, in traversal order.
    """
    matches = []

    for folder, _subdirs, names in os.walk(root_dir):
        # Anything that is not a "configs" folder is irrelevant here.
        if os.path.basename(folder) != "configs":
            continue
        matches.extend(
            os.path.join(folder, name)
            for name in names
            if name.endswith("config.yaml")
        )

    return matches

# Example usage: scan a root directory and list every config file found.
# `find_config_files` only returns files named "*config.yaml" that sit
# directly inside a folder called "configs".
root_directory = '/pscratch/sd/m/mherbold/tests/'  # Replace with the path to your directory
config_files = find_config_files(root_directory)

# Print one "Found: <path>" line per config file (nothing is printed when
# the list is empty).
for config_file in config_files:
    print(f"Found: {config_file}")


Found: /pscratch/sd/m/mherbold/tests/bookkeeper/broadband_polynomials/configs/bookkeeper_config.yaml
Found: /pscratch/sd/m/mherbold/tests/bookkeeper/version_test/configs/bookkeeper_config.yaml
Found: /pscratch/sd/m/mherbold/tests/bookkeeper/baseline/configs/bookkeeper_config.yaml
Found: /pscratch/sd/m/mherbold/tests/bookkeeper/no_arinyo/configs/bookkeeper_config.yaml
Found: /pscratch/sd/m/mherbold/tests/bookkeeper/lr_max_1200/configs/bookkeeper_config.yaml
Found: /pscratch/sd/m/mherbold/tests/bookkeeper/baseline_test/configs/bookkeeper_config.yaml


## Create a table of info from .yaml files

Search for config file information, such as catalog location, software versions, parameter values, etc.

In [57]:
def extract_variation_name(file_path):
    """Return the name of the directory two levels above ``file_path``.

    For a path shaped like ``<variation>/configs/<file>`` this is
    ``<variation>``.

    Args:
        file_path: Path to a config file.

    Returns:
        str: Base name of the grandparent directory of ``file_path``.
    """
    containing_dir = os.path.dirname(file_path)
    grandparent_dir = os.path.dirname(containing_dir)
    return os.path.basename(grandparent_dir)

def find_keys_in_yaml(file_path, keys):
    """Look up dotted key paths in a YAML file.

    The file is parsed with ``yaml.safe_load``. Each entry of ``keys`` is
    split on ``.`` and the pieces are followed one level at a time through
    nested dictionaries.

    Args:
        file_path: Path of the YAML file to read.
        keys: Iterable of dotted key paths, e.g. ``'data.catalog'``.

    Returns:
        dict: Maps every requested key path to the value found there, or to
        ``None`` when a piece of the path is missing or a non-dictionary
        value is reached before the path is exhausted.
    """
    with open(file_path, 'r') as stream:
        config = yaml.safe_load(stream)

    def lookup(dotted_key):
        node = config
        for part in dotted_key.split('.'):
            # Cannot descend any further into a non-mapping value.
            if not isinstance(node, dict):
                return None
            node = node.get(part)
        return node

    return {dotted_key: lookup(dotted_key) for dotted_key in keys}

def extract_module_versions(file_path):
    """Collect ``# <module> version: <version>`` comment lines from a file.

    A line is considered when it starts with ``"# "`` and contains the
    marker ``" version: "``. The text between the leading ``"# "`` and the
    first marker is the module name; everything after that marker, with
    trailing whitespace removed, is the version.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        dict: Module name -> version string. When a module name appears on
        several lines, the last one wins. A line whose version part is empty
        yields an empty string.
    """
    marker = " version: "
    modules = {}
    with open(file_path, "r") as f:
        for line in f:
            if not (line.startswith("# ") and marker in line):
                continue
            # Partition on the FIRST marker of the unstripped line: a second
            # marker later in the line, or an empty version followed only by
            # whitespace, must not break the name/version unpacking.
            module_name, _, version = line[2:].partition(marker)
            modules[module_name] = version.rstrip()
    return modules

root_directory = '/pscratch/sd/m/mherbold/tests/bookkeeper'  # Replace with the path to your directory

# Specify info from config files to extract.
# Note the key structure: each "." descends one level in the YAML hierarchy.
keys_to_extract = [
    'data.healpix data',
    'data.catalog',
    'delta extraction.dla',
    'delta extraction.bal',
    'delta extraction.mask file'
]

# Scan the directory tree once and read each file's module-version header
# once; both the column set and the rows below are built from this cache.
config_paths = find_config_files(root_directory)
modules_by_path = {
    file_path: extract_module_versions(file_path) for file_path in config_paths
}

# Union of module names over all config files, sorted so that the module
# columns appear in a stable order.
all_modules = sorted(set().union(*modules_by_path.values()))

# One row per config file: variation name, requested YAML values, then one
# column per module ('N/A' when a file does not record that module).
data_rows = []
for file_path in config_paths:
    row = {'Variation Name': extract_variation_name(file_path)}
    row.update(find_keys_in_yaml(file_path, keys_to_extract))

    modules = modules_by_path[file_path]
    for module in all_modules:
        row[module] = modules.get(module, 'N/A')

    data_rows.append(row)

df = pd.DataFrame(data_rows)
df.to_csv('config_data_with_modules.csv', index=False)
print(df)


          Variation Name                                  data.healpix data  \
0  broadband_polynomials  /dvs_ro/cfs/cdirs/desi/spectro/redux/jura/healpix   
1           version_test  /dvs_ro/cfs/cdirs/desi/spectro/redux/jura/healpix   
2               baseline  /dvs_ro/cfs/cdirs/desi/spectro/redux/jura/healpix   
3              no_arinyo  /dvs_ro/cfs/cdirs/desi/spectro/redux/jura/healpix   
4            lr_max_1200  /dvs_ro/cfs/cdirs/desi/spectro/redux/jura/healpix   
5          baseline_test  /dvs_ro/cfs/cdirs/desi/spectro/redux/jura/healpix   

                                        data.catalog  \
0  /global/cfs/cdirs/desicollab/science/lya/y3/ju...   
1  /global/cfs/cdirs/desicollab/science/lya/y3/ju...   
2  /global/cfs/cdirs/desicollab/science/lya/y3/ju...   
3  /global/cfs/cdirs/desicollab/science/lya/y3/ju...   
4  /global/cfs/cdirs/desicollab/science/lya/y3/ju...   
5  /global/cfs/cdirs/desicollab/science/lya/y3/ju...   

                                delta extraction.dla 