In [1]:
import csv
import os
import re

import pandas as pd
import yaml


def extract_float(s):
    """
    Return the first unsigned decimal number embedded in ``s`` as a float.

    The search looks for a run of digits optionally followed by a decimal
    point and more digits; any sign in front of the digits is not part of
    the match. Returns None when ``s`` contains no digits at all.

    Note: a function with the same name is defined again in the second cell,
    so after that cell runs this regex-based version is no longer the one
    bound to ``extract_float``.
    """
    found = re.search(r'(\d+(\.\d+)?)', s)
    # group(0) is the whole matched text, e.g. "12.5" out of "acc=12.5%"
    return float(found.group(0)) if found else None

def flatten_dict(d, parent_key='', sep='.'):
    """
    Collapse a nested dictionary into a single-level one.

    Each leaf value is stored under the path of keys that leads to it, joined
    with ``sep`` (``{'a': {'b': 1}}`` becomes ``{'a.b': 1}``). ``parent_key``
    is the path prefix accumulated so far; callers normally leave it empty.
    Nested dictionaries that are empty contribute no entries.
    """
    flat = {}
    for key, value in d.items():
        path = key if not parent_key else f"{parent_key}{sep}{key}"
        if isinstance(value, dict):
            # Descend, carrying the path built so far as the new prefix
            flat.update(flatten_dict(value, path, sep=sep))
        else:
            flat[path] = value
    return flat

def read_yaml_files(directory):
    """
    Load every ``.yaml`` / ``.yml`` file found under ``directory`` (searched
    recursively) and flatten each one with ``flatten_dict``.

    Files whose top-level document is not a mapping (for example an empty
    file, which parses to None) cannot be flattened; they are reported with a
    printed message and skipped instead of aborting the whole scan.

    Directories and files are visited in sorted order so that the order of
    the returned records is reproducible from run to run.

    Returns:
        (data, unique_keys): ``data`` is a list with one flattened dict per
        loaded file; ``unique_keys`` is the set of all flattened keys seen
        across those files.
    """
    data = []
    unique_keys = set()
    # Walk through all directories and subdirectories
    for dirpath, dirnames, filenames in os.walk(directory):
        # Sorting in place makes os.walk descend in a deterministic order
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.endswith(('.yaml', '.yml')):
                continue
            filepath = os.path.join(dirpath, filename)
            with open(filepath, 'r') as file:
                yaml_data = yaml.safe_load(file)
            if not isinstance(yaml_data, dict):
                print(
                    f"Skipping {filepath}: expected a top-level mapping, "
                    f"got {type(yaml_data).__name__}"
                )
                continue
            flattened_data = flatten_dict(yaml_data)
            data.append(flattened_data)
            # Track every column name that appears in any file
            unique_keys.update(flattened_data.keys())
    return data, unique_keys

def write_to_flat_file(data, unique_keys, output_file):
    """
    Write the flattened records to ``output_file`` as a tab-separated table.

    Args:
        data: iterable of dicts, one per output row.
        unique_keys: collection of every key that should become a column.
        output_file: path of the file to create (overwritten if it exists).

    The header row lists the keys in sorted order (compared by their string
    form, so non-string keys do not break the sort). A record that lacks a
    key gets an empty cell in that column; every present value is written
    via ``str()``.

    Rows go through ``csv.writer`` so that a value containing a tab, newline
    or double quote is quoted instead of silently shifting the columns of
    its row. Values without those characters are written exactly as before.
    """
    headers = sorted(unique_keys, key=str)
    # newline='' lets the csv writer control line endings itself
    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file, delimiter='\t', lineterminator='\n')
        writer.writerow(headers)
        for item in data:
            writer.writerow([str(item.get(key, "")) for key in headers])



In [2]:
def extract_float(s):
    """
    Convert ``s`` to a float, returning None when it cannot be converted.

    Unlike a regex search, the whole of ``s`` must be a valid number as
    accepted by ``float()``. Non-numeric strings and objects that ``float()``
    rejects outright (such as None) both yield None instead of raising.

    Note: this definition replaces the regex-based ``extract_float`` from the
    first cell once this cell has been executed.
    """
    try:
        return float(s)
    except (TypeError, ValueError):
        # ValueError: malformed text; TypeError: not a string/number at all
        return None

def _collect_best_acc(directory_prefix):
    """Walk ``directory_prefix`` and map each experiment folder name to its best_acc value."""
    experiment_acc = {}

    # Search through all levels of the directory
    for root, dirs, files in os.walk(directory_prefix):
        if 'worklog.txt' not in files:
            continue
        worklog_path = os.path.join(root, 'worklog.txt')
        experiment_name = os.path.basename(root)  # Assumes the experiment name is the folder name

        try:
            with open(worklog_path, 'r') as worklog_file:
                for line in worklog_file:
                    if not line.startswith('best_acc'):
                        continue
                    # The value is taken from the last whitespace-separated token
                    best_acc_value = extract_float(line.split()[-1])
                    if best_acc_value is None:
                        continue  # unparsable line; keep scanning this file
                    if experiment_name in experiment_acc:
                        # Same folder name under a different parent: the later one wins
                        print(
                            f"Warning: duplicate experiment name '{experiment_name}' "
                            f"at {worklog_path}; overriding earlier value"
                        )
                    experiment_acc[experiment_name] = best_acc_value
                    break  # only the first parsable best_acc line is used
        except Exception as e:
            # Best effort: one unreadable worklog should not abort the scan
            print(f"Error reading {worklog_path}: {e}")

    return experiment_acc


def add_best_acc_to_file(input_file, output_file, directory_prefix):
    """
    Add a BEST_ACC column to a tab-delimited table of experiments.

    Args:
        input_file: path of a tab-delimited file that has an
            ``EXPERIMENT.NAME`` column.
        output_file: path the augmented tab-delimited table is written to.
        directory_prefix: root folder searched recursively for ``worklog.txt``
            files; the name of the folder holding each worklog is treated as
            the experiment name.

    For each worklog, the first line starting with ``best_acc`` whose last
    token parses as a float supplies the value. Rows whose experiment name
    has no matching worklog get NaN in BEST_ACC. If several folders share a
    name, the one visited last wins and a warning is printed.

    Raises:
        ValueError: if ``input_file`` has no ``EXPERIMENT.NAME`` column.
    """
    # Load the tab-delimited file into a DataFrame
    df = pd.read_csv(input_file, delimiter='\t')

    # Ensure the EXPERIMENT.NAME column exists
    if 'EXPERIMENT.NAME' not in df.columns:
        raise ValueError("EXPERIMENT.NAME column not found in the input file.")

    experiment_acc = _collect_best_acc(directory_prefix)

    # Vectorised lookup: names without an entry map to NaN. The final astype
    # keeps the column numeric even when nothing matched.
    df['BEST_ACC'] = df['EXPERIMENT.NAME'].astype(str).map(experiment_acc).astype(float)

    # Save the updated DataFrame to a new tab-delimited file
    df.to_csv(output_file, sep='\t', index=False)

In [3]:
# `directory` (set in the next cell) points to the folder containing the YAML configs.
# `directory_prefix` (set in the cell after that) points to the folder containing the experiment outputs.

In [4]:
# Root folder that read_yaml_files() searches recursively for .yaml/.yml configs.
# To limit the run to a subset of the configs, use one of the commented-out
# alternatives below instead of the active assignment, which covers the whole
# cifar100 config tree.
#directory = '/projectnb/textconv/distill/mdistiller/configs/cifar100/_bldmse/r328r84/'
#directory = '/projectnb/textconv/distill/mdistiller/configs/cifar100/_sldmse/r56r20/' 
#directory = '/projectnb/textconv/distill/mdistiller/configs/cifar100/_bldmsep/'
#directory = '/projectnb/textconv/distill/mdistiller/configs/cifar100/_bldcd/'
#directory = '/projectnb/textconv/distill/mdistiller/configs/cifar100/_bldcdp/'
#directory = '/projectnb/textconv/distill/mdistiller/configs/cifar100/_bldcdpmp/'
#directory = '/projectnb/textconv/distill/mdistiller/configs/cifar100/_bldcd2p/'
directory = '/projectnb/textconv/distill/mdistiller/configs/cifar100/'

# Intermediate tab-separated table (one row per YAML file); the following cell
# reads a file of this same name as its input.
output_file = 'sample_out_w.tsv'

# Load and flatten every YAML file under `directory`
data, unique_keys = read_yaml_files(directory)

# Write the flattened records to the intermediate table, one column per key
write_to_flat_file(data, unique_keys, output_file)

In [5]:
input_file = 'sample_out_w.tsv'  # Tab-delimited table to augment; same file name the previous cell writes
from datetime import datetime

# Timestamp taken once so the output file name is unique per run
current_datetime = datetime.now()

# Format the date and time in a way that is safe for filenames
filename_format = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")
# Split the config path (`directory`, set in the previous cell) into components.
# parts.index() raises ValueError if no component is exactly 'mdistiller'.
parts = directory.split('/')
mdistiller_index = parts.index('mdistiller')

# Take everything after 'mdistiller' and join it with underscores
# NOTE(review): when `directory` ends with '/', the last component is an empty
# string, so `direx` ends in '_' and the file name below contains a double
# underscore before the timestamp. Confirm whether that is intended.
direx = '_'.join(parts[mdistiller_index+1:])
# NOTE: this rebinds `output_file`, which the previous cell used for the intermediate table
output_file = f'{direx}_{filename_format}.tsv'  # The path for the output file with the BEST_ACC column
directory_prefix = '/projectnb/textconv/distill/mdistiller/output/'  # Root folder searched recursively for experiment folders containing worklog.txt
# Alternative root that restricts the search to a single output subfolder:
#directory_prefix = '/projectnb/textconv/distill/mdistiller/output/imagenet_sldmse'
# Read input_file, add the BEST_ACC column and write the result to output_file
add_best_acc_to_file(input_file, output_file, directory_prefix)