In [None]:
from lossmoother import LosSmoother
import json
import pandas as pd
import numpy as np
import random

def _subsample_logspace(length: int, gap: float) -> np.ndarray:
    """
    Subsample steps [1, length] in log10 space with bin width `gap`,
    always including the last step.
    """
    pos = np.arange(length)          # 0, 1, ..., length-1
    logs = np.log10(pos + 1)         # log10 of steps 1..length
    bins = np.floor(logs / gap).astype(int)

    # Traverse positions in reverse so we keep the last step per bin
    rev_pos = pos[::-1]
    _, first_rev_idx = np.unique(bins[rev_pos], return_index=True)

    # Map back to original positions, sort, then convert to 1-based steps
    chosen_steps = np.sort(rev_pos[first_rev_idx]) + 1
    return chosen_steps


def _create_datapoints(length, gap: float = 0.01, feature_cutoff: float = 0.6, target_cutoff: float = 0.9, max_datapoints: int = 100):
    samples = _subsample_logspace(length, gap)
    cutoff = lambda c: int(len(samples) * c)
    pairs = [
        (feature_step, target_step)
        for feature_step in samples[cutoff(feature_cutoff):]
        for target_step in samples[cutoff(target_cutoff):]
        if feature_step < target_step
    ]
    if len(pairs) > max_datapoints:
        pairs = random.sample(pairs, max_datapoints)
    return zip(*pairs)


def create_dataset(path: str = 'src/runs_data.json', total: int = 4300, feature_callables: list[callable] = []) -> pd.DataFrame:
    """
    Process runs_experiments and create a dataframe with processed loss data.
    """ 
    with open(path, 'r') as f:
        runs_experiments = json.load(f)
    
    runs_data = {}
    for run in runs_experiments:
        if len(run['train_loss']) > total:         
            feature_steps, target_steps = _create_datapoints(len(run['train_loss'][:total]))
            runs_data[run['run_id']] = {
                'raw_losses': run['train_loss'][:total],
                'feature_steps': feature_steps,
                'target_steps': target_steps,
                'delta_steps': [t - f for t, f in zip(target_steps, feature_steps)],
            }

    # Preprocess losses for each run using LosSmoother and collect target losses
    for run_data in runs_data.values():
        lossmother = LosSmoother()
        run_data['preprocessed_losses'] = [lossmother.update(loss)[1] for loss in run_data['raw_losses']]
        run_data['target_losses'] = [run_data['preprocessed_losses'][t-1] for t in run_data['target_steps']]

    df = pd.DataFrame(
        {'run_id': run_id, 'feature_step': f_step, 'target_step': t_step, 'target_loss': t_loss, 'delta_steps': delta_steps}
        for run_id, run_data in runs_data.items()
        for f_step, t_step, t_loss, delta_steps in zip(run_data['feature_steps'], run_data['target_steps'], run_data['target_losses'], run_data['delta_steps'])
    )

    for feature_fn in feature_callables:
        for idx, row in df.iterrows():
            cutoff_loss = runs_data[row['run_id']]['preprocessed_losses'][:row['feature_step']] # step is already + 1, so it will be inclusded
            df.loc[idx, feature_fn.__name__] = feature_fn(cutoff_loss)

    return df



def last_loss(loss: list) -> float:
    return loss[-1]

def derivative(loss: list) -> float:
    last_loss = loss[-1]
    for i in range(len(loss)-1, 0, -1):
        if loss[i] != last_loss:
            break
    else:
        return 0
    return (loss[i] - last_loss) / (i - len(loss))


df = create_dataset(path='runs_data.json', feature_callables=[last_loss, derivative])
df

Unnamed: 0,run_id,feature_step,target_step,target_loss,delta_steps,last_loss,derivative
0,a9fa312754c8461e9e6920f6567958f5,1148,3981,0.447833,2833,0.480681,-7.317593e-06
1,a9fa312754c8461e9e6920f6567958f5,3467,4300,0.447104,833,0.449832,-9.536021e-06
2,a9fa312754c8461e9e6920f6567958f5,831,3981,0.447833,3150,0.499638,-6.644935e-06
3,a9fa312754c8461e9e6920f6567958f5,1288,3311,0.450306,2023,0.476144,-2.659880e-05
4,a9fa312754c8461e9e6920f6567958f5,3090,3715,0.448127,625,0.451319,-1.136788e-06
...,...,...,...,...,...,...,...
1895,4f09218fe6d1488985d184547a6e4c4f,467,3981,0.457133,3514,0.517019,-1.020339e-07
1896,4f09218fe6d1488985d184547a6e4c4f,2454,2951,0.462584,497,0.466161,-2.085648e-05
1897,4f09218fe6d1488985d184547a6e4c4f,724,3981,0.457133,3257,0.503851,-5.008175e-05
1898,4f09218fe6d1488985d184547a6e4c4f,1258,4168,0.456692,2910,0.489097,-2.907030e-05


In [None]:
import numpy as np


def _subsample_uniformly_in_logspace(df, col, gap):
    """
    Uniformly subsample rows by uniquely binning `col` with width `gap`.
    """
    key = np.floor(df[col] / gap).astype(int)
    return df.loc[key.drop_duplicates().index]


def _create_datapoints(length, gap: int, feature_cutoff = 0.4, target_cutoff = 0.1):
    assert gap > 0
    target_ids = list(range(int(length - length * target_cutoff), length + 1, gap))
    feature_ids = list(range(int(length - length * feature_cutoff), length + 1, gap))
    feature_steps = [f+1 for f in feature_ids for t in target_ids if f < t]
    target_steps = [t+1 for f in feature_ids for t in target_ids if f < t]
    return feature_steps, target_steps



a, b = _create_datapoints(4000, 1)
len(a), len(b), a[-10:], a[:10]

(561400,
 561400,
 [3997, 3997, 3997, 3997, 3998, 3998, 3998, 3999, 3999, 4000],
 [2401, 2401, 2401, 2401, 2401, 2401, 2401, 2401, 2401, 2401])

In [40]:
import numpy as np
import random

def _subsample_logspace(length: int, gap: float) -> np.ndarray:
    """
    Subsample steps [1, length] in log10 space with bin width `gap`,
    always including the last step.
    """
    pos = np.arange(length)          # 0, 1, ..., length-1
    logs = np.log10(pos + 1)         # log10 of steps 1..length
    bins = np.floor(logs / gap).astype(int)

    # Traverse positions in reverse so we keep the last step per bin
    rev_pos = pos[::-1]
    _, first_rev_idx = np.unique(bins[rev_pos], return_index=True)

    # Map back to original positions, sort, then convert to 1-based steps
    chosen_steps = np.sort(rev_pos[first_rev_idx]) + 1
    return chosen_steps


def _create_datapoints(length, gap: float = 0.02, feature_cutoff: float = 0.6, target_cutoff: float = 0.9, max_datapoints: int = 100):
    samples = _subsample_logspace(length, gap)
    cutoff = lambda c: int(len(samples) * c)
    pairs = [
        (feature_step, target_step)
        for feature_step in samples[cutoff(feature_cutoff):]
        for target_step in samples[cutoff(target_cutoff):]
        if feature_step < target_step
    ]
    if len(pairs) > max_datapoints:
        pairs = random.sample(pairs, max_datapoints)
    return zip(*pairs)


fs, ts = _create_datapoints(4300, 0.02)

len(fs), len(ts), set(fs), set(ts)


(100,
 100,
 {np.int64(363),
  np.int64(380),
  np.int64(398),
  np.int64(416),
  np.int64(436),
  np.int64(457),
  np.int64(478),
  np.int64(524),
  np.int64(549),
  np.int64(575),
  np.int64(602),
  np.int64(630),
  np.int64(660),
  np.int64(691),
  np.int64(724),
  np.int64(758),
  np.int64(794),
  np.int64(831),
  np.int64(870),
  np.int64(912),
  np.int64(954),
  np.int64(999),
  np.int64(1047),
  np.int64(1096),
  np.int64(1148),
  np.int64(1202),
  np.int64(1318),
  np.int64(1380),
  np.int64(1445),
  np.int64(1513),
  np.int64(1584),
  np.int64(1659),
  np.int64(1737),
  np.int64(1819),
  np.int64(1905),
  np.int64(1995),
  np.int64(2089),
  np.int64(2290),
  np.int64(2398),
  np.int64(2511),
  np.int64(2630),
  np.int64(2754),
  np.int64(2884),
  np.int64(3019),
  np.int64(3311),
  np.int64(4168)},
 {np.int64(2398),
  np.int64(2511),
  np.int64(2630),
  np.int64(2754),
  np.int64(2884),
  np.int64(3019),
  np.int64(3162),
  np.int64(3311),
  np.int64(3467),
  np.int64(3630),
 