In [None]:
import numpy as np
import pandas as pd

#### dummy data

### create X, y, groups (by subject id)

In [None]:
from sklearn.preprocessing import LabelEncoder

def get_X_y_groups(X_fname, df_fname, y_column_name, group_column_name):
    '''
    Load the inputs for one linear probe: features, targets and group ids.

    Parameters
    ----------
    X_fname : str or path-like
        File read with ``np.load`` (pretrained model embeddings).
    df_fname : str or path-like
        Metadata sheet read with ``pd.read_csv``.
    y_column_name : str
        Metadata column whose values become the target ``y``.
    group_column_name : str
        Metadata column whose values identify the group of each row.

    Returns
    -------
    X
        Whatever ``np.load(X_fname)`` returns.
    y : numpy.ndarray
        Values of ``y_column_name``, in metadata row order.
    subject_groups : pandas.Series
        One integer code per metadata row (same index as the metadata
        sheet); rows with equal ``group_column_name`` values share a code.

    Raises
    ------
    AssertionError
        If ``X``, ``y`` and ``subject_groups`` do not have the same length.

    Notes
    -----
    Only the lengths are checked. That row ``i`` of ``X`` corresponds to
    row ``i`` of the metadata sheet is assumed, not verified -- TODO confirm
    where the embeddings are exported.
    '''
    # read X (pretrained model embeddings)
    X = np.load(X_fname)

    # read metadata sheet
    df = pd.read_csv(df_fname)
    y = df[y_column_name].values

    # Encode every group label in a single vectorised call. Transforming
    # row by row through DataFrame.apply calls the encoder once per row and
    # leaves a length-1 array in every element instead of a scalar id.
    encoded_groups = LabelEncoder().fit_transform(df[group_column_name].tolist())
    subject_groups = pd.Series(encoded_groups, index=df.index)

    assert len(X) == len(y) == len(subject_groups), (
        f"length mismatch: X={len(X)}, y={len(y)}, groups={len(subject_groups)}"
    )

    return X, y, subject_groups

### for a given probe_type/feature_type and model_patch_size, train new linear probes for all available model checkpoints 

In [None]:
from utils.pipeline import regression_pipeline_runner

# Settings shared by every linear probe trained in this notebook.

# Metadata column used as the regression target, and the patch-size tag
# that prefixes the embedding file names.
feature_type = "delta_pib"
model_patch_size = "1sec"

# Probe model and cross-validation setup handed to regression_pipeline_runner.
model_type = "LinReg"
cv_type = "Simple_KFold"
cv_params = dict(
    cv_folds=dict(outer=10),
    n_jobs=dict(outer=1, model_fit=1),
    random_state=2509843,
)

# One probe is trained per entry in this list.
model_checkpoints = ["epoch10"]

### serial execution

In [None]:
# Results of the trained probes, keyed by checkpoint name.
all_probes = dict()

for checkpoint in model_checkpoints:
    # Load the embeddings of this checkpoint together with targets and groups.
    X, y, groups = get_X_y_groups(
        f"{model_patch_size}_{checkpoint}.npy",  # X_fname
        "metadata.csv",                          # df_fname
        feature_type,                            # y_column_name
        "subject_id",                            # group_column_name
    )

    # Run the regression pipeline and keep only the entry of the configured model.
    results = regression_pipeline_runner(X, y, groups, model_type, cv_type, cv_params)
    all_probes[checkpoint] = results[model_type]


### TODO: parallel execution

In [1]:
# ....................

In [None]:
import pickle

# Date tag embedded in the name of the output file.
date = "5_12_25"

# Write the probes of all checkpoints into a single pickle file.
with open(
    f"{model_patch_size}_{feature_type}_probes_for_all_checkpoints_{date}.pkl",
    mode="wb",
) as f:
    pickle.dump(all_probes, f)

### plot test score (y-axis) for all model checkpoints (x-axis)