In [1]:
import os, sys
root_dir = os.path.dirname(os.getcwd())
sys.path.append(root_dir)

In [2]:
import wandb
import numpy as np
import pandas as pd
from environment import WANDB_INFO
from tqdm import tqdm
from helper import save_pickle

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
select_dim_only = True
update_dim_only = False

In [4]:
api = wandb.Api()

In [5]:
def calculate_aulc(epochs, losses):
    """
    Calculate the Area Under the Learning Curve (AULC)
    
    Parameters:
    epochs (array-like): The x-axis values (typically epoch numbers or iterations)
    losses (array-like): The y-axis values (loss values at each epoch/iteration)
    
    Returns:
    float: The calculated AULC value
    """
    # Ensure inputs are numpy arrays
    epochs = np.array(epochs)
    losses = np.array(losses)
    
    # Sort the arrays by epoch to ensure correct ordering
    sort_indices = np.argsort(epochs)
    epochs = epochs[sort_indices]
    losses = losses[sort_indices]
    
    # Calculate the area using the trapezoidal rule
    area = np.trapz(losses, epochs)
    
    return area

In [6]:
def get_loss(**kwargs):
    if not 'task_type' in kwargs:
        raise Exception('task_type is required')
    
    df = pd.read_pickle(f'{root_dir}/analysis/results.pkl')
    for k, v in kwargs.items():
        df = df[df[k] == v].reset_index(drop=True)
        
    print(f'Found {len(df)} results')
    return df

In [7]:
df = get_loss(
    task_type='regression',
    model = 's4',
    pretrain = True, 
    d_model = 4,
    select_dim_only = select_dim_only,
    update_dim_only = update_dim_only,
)

Found 1152 results


In [8]:
aulc_list= {}
all_rows = list(df.iterrows())
for idx, row in tqdm(all_rows):
    run = api.run(f"{WANDB_INFO['entity']}/{WANDB_INFO['project']}/{row.run_id}")
    history = run.history()
    # Extract the loss values and steps
    steps = history["_step"]
    loss = history["best_loss"]
    plot_df = pd.DataFrame({"steps": steps, "loss": loss})
    # drop nan
    plot_df = plot_df.dropna(subset=["loss"]).reset_index(drop=True)

    aulc = calculate_aulc(plot_df['steps'], plot_df['loss'])
    aulc_list[row.load_ours_config] = aulc
        

  0%|          | 0/1152 [00:00<?, ?it/s]

100%|██████████| 1152/1152 [08:15<00:00,  2.32it/s]


In [9]:
sorted_idxs = sorted(aulc_list, key=aulc_list.get)

unique_set = []
for idx in sorted_idxs:
    if idx not in unique_set:
        unique_set.append(idx)
        
print(f'In total {len(sorted_idxs)} runs!')
sorted_idxs = unique_set

In total 576 runs!


In [10]:
sorted_idxs

[569.0,
 573.0,
 571.0,
 474.0,
 283.0,
 572.0,
 575.0,
 478.0,
 281.0,
 501.0,
 477.0,
 476.0,
 280.0,
 500.0,
 376.0,
 287.0,
 574.0,
 286.0,
 380.0,
 405.0,
 348.0,
 509.0,
 508.0,
 341.0,
 340.0,
 404.0,
 510.0,
 511.0,
 215.0,
 377.0,
 213.0,
 439.0,
 414.0,
 379.0,
 221.0,
 412.0,
 212.0,
 220.0,
 187.0,
 61.0,
 63.0,
 437.0,
 90.0,
 185.0,
 124.0,
 350.0,
 191.0,
 126.0,
 513.0,
 535.0,
 445.0,
 563.0,
 88.0,
 343.0,
 467.0,
 89.0,
 189.0,
 568.0,
 95.0,
 94.0,
 55.0,
 465.0,
 560.0,
 514.0,
 562.0,
 475.0,
 473.0,
 444.0,
 464.0,
 567.0,
 253.0,
 118.0,
 362.0,
 472.0,
 519.0,
 564.0,
 199.0,
 518.0,
 308.0,
 157.0,
 255.0,
 417.0,
 363.0,
 60.0,
 360.0,
 275.0,
 225.0,
 532.0,
 566.0,
 570.0,
 282.0,
 503.0,
 347.0,
 28.0,
 250.0,
 274.0,
 520.0,
 103.0,
 451.0,
 542.0,
 450.0,
 502.0,
 479.0,
 407.0,
 310.0,
 52.0,
 359.0,
 367.0,
 358.0,
 471.0,
 98.0,
 364.0,
 101.0,
 242.0,
 457.0,
 356.0,
 469.0,
 554.0,
 338.0,
 99.0,
 285.0,
 100.0,
 229.0,
 413.0,
 539.0,
 119.0,
 524.

In [11]:
result_path = f'{root_dir}/analysis'
if select_dim_only:
    result_path += '/select_dim_sorted_idxs.pickle'
elif update_dim_only:
    result_path += '/update_dim_sorted_idxs.pickle'
    
os.makedirs(os.path.dirname(result_path), exist_ok=True)
save_pickle(sorted_idxs, result_path)