In [None]:
import pandas as pd
import pre_processing as prep


# Preprocess
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Rename, filter and rate-convert the columns of one worker's frame.

    Steps, in order:

    1. Rename columns to a more generalized format (e.g., "worker" instead
       of "worker1") via ``prep.rename_workers`` followed by
       ``prep.rename_container_columns``.
    2. Drop columns that are useless or difficult to use (e.g., process
       specific metrics): a column is kept only if every ``prep.filter_*``
       predicate listed below returns a truthy value for its name.
    3. Convert monotonically increasing columns to rates (counters to
       gauges, e.g., total joules to joules per unit of time) via
       ``prep.convert_to_rates``, then drop the original counter columns
       reported by ``prep.get_counters``.

    The caller's frame is not relabelled: renaming is done on a new object
    instead of assigning to ``df.columns`` in place.

    Args:
        df: Frame to preprocess.

    Returns:
        A new DataFrame containing the renamed, filtered and converted
        columns.
    """
    # Two separate passes, so the second renamer sees the output of the first.
    labels = [prep.rename_workers(col) for col in df.columns]
    labels = [prep.rename_container_columns(col) for col in labels]
    # set_axis returns a new object, leaving the caller's column labels alone.
    df = df.set_axis(labels, axis=1)

    # Each predicate receives a column name; truthy means "keep".
    # Applied one after another, in this order.
    column_filters = (
        prep.filter_namespaces,
        prep.filter_go_specific,
        prep.filter_process_specific,
        prep.filter_durations,
        prep.filter_experiment_specific,
    )
    for keep in column_filters:
        df = df[[col for col in df.columns if keep(col)]]

    # Counters are detected on the already-filtered frame and dropped after
    # conversion.
    counters = prep.get_counters(df)
    df = prep.convert_to_rates(df, counters)
    df = df[[col for col in df.columns if col not in counters]]

    # Get a defragmented copy of the df after all the edits
    # (probably does nothing useful).
    return df.copy()


# Save feather

In [None]:
import os

# Get all model subfolders (those that contain a worker1 feather file)
root_folder = '../data/minimized_run_3'
filename = 'worker1.feather'
model_folders = prep.find_subfolders_with_file(root_folder, filename)
print(model_folders)

# For each worker feather: read it, preprocess it, and save the result under
# save_folder/<name of the model folder>/
save_folder = "preprocessed_workers/"
for model_folder in model_folders:
    # Only trailing slashes matter for basename, so strip just those.
    last_folder = os.path.basename(model_folder.rstrip('/'))
    # os.path.join avoids the doubled separator that plain concatenation
    # produced because save_folder already ends with '/'.
    save_path = os.path.join(save_folder, last_folder)
    for worker in range(1, 6):
        print(f"Preprocessing worker {worker}")
        path = os.path.join(model_folder, f"worker{worker}.feather")
        df = pd.read_feather(path)
        df = preprocess(df)
        # Created only once there is something to write, as before.
        os.makedirs(save_path, exist_ok=True)
        out_file = os.path.join(save_path, f"worker{worker}.feather")
        df.to_feather(out_file)
        print(f"Saved worker{worker} to {out_file}")