In [1]:
import numpy as np
import sys
sys.path.append('/home3/ebrahim2/beyond-brainscore/run_reg_scripts/')
from helper_funcs import combine_MSE_across_folds
from matplotlib import pyplot as plt

In [4]:
import numpy as np

def _mean_across_participants(values_per_unit, participant_info):
    """
    values_per_unit: 1D array, length = # brain units (e.g., R² per unit)
    participant_info: 1D array, length = # brain units, giving participant id per unit

    Returns:
      overall_mean: scalar mean of per-participant means (ignores NaNs)
      per_participant_means: dict {participant_id: mean_value}
    """
    vals = np.asarray(values_per_unit)
    parts = np.asarray(participant_info)
    per_part_means = {}
    for pid in np.unique(parts):
        mask = (parts == pid)
        if np.any(mask):
            per_part_means[pid] = np.clip(np.nanmean(vals[mask]), 0, np.inf)
    overall = np.nanmean(list(per_part_means.values())) if per_part_means else np.nan
    return overall, per_part_means


def compare_perf_when_using_val(
    dataset,
    y_true,
    test_fold_idx,
    layer_range,
    mse_intercept,
    model_name,
    model_name_y_hat,
    participant_info,
    shuffled=False,
    return_participant_means=False
):
    """
    Compare out-of-sample R² when the layer/hyperparameter is selected on the
    validation scores versus selected directly on the test set.

    For every value in ``layer_range`` the file
    ``/data/LLMs/brainscore/results_{dataset}/[shuffled/]{dataset}_{name}.npz``
    is read, where ``name`` is ``model_name`` with every occurrence of the
    substring ``'-1'`` replaced by that value. Its ``val_scores`` are averaged
    over the last axis and its ``out_of_sample_r2`` is aggregated per
    participant. For each outer fold the value with the highest mean validation
    score is chosen, the squared error of that value's ``y_hat`` is taken on the
    fold's rows, and R² is recomputed as ``1 - mse / mse_intercept``. Both
    aggregate scores are printed.

    Args:
        dataset: dataset name used in the results directory and file prefix.
        y_true: ground-truth responses; must be subtractable from the stored
            ``y_hat`` (per the comments below, time x brain units).
        test_fold_idx: fold boundaries; fold ``i`` covers rows
            ``test_fold_idx[i]:test_fold_idx[i + 1]``.
        layer_range: non-empty indexable sequence of layer ids / hyperparameter
            values to sweep.
        mse_intercept: baseline MSE that the model MSE is divided by
            (presumably one value per brain unit — TODO confirm).
        model_name: file-name template (containing ``'-1'``) for the scores.
        model_name_y_hat: file-name template (containing ``'-1'``) for ``y_hat``.
        participant_info: shape (# brain units,), contains the participant id
            for the corresponding brain unit.
        shuffled: if True, read from the ``shuffled/`` sub-directory.
        return_participant_means: if True, also return per-participant means.

    Returns:
        ``(test_score, val_scores_across_layers_stacked)`` where ``test_score``
        is a list with one aggregated test R² per swept value and the stacked
        array is (num_layers x num_folds). With ``return_participant_means=True``
        two dicts are appended: the per-participant means of the best test
        value, and those of the validation-selected predictions.

    Raises:
        ValueError: if ``layer_range`` is empty.
    """
    if len(layer_range) == 0:
        raise ValueError("layer_range must contain at least one layer/hyperparameter value")

    base_path = f'/data/LLMs/brainscore/results_{dataset}/'
    if shuffled:
        base_path += 'shuffled/'

    val_scores_across_layers = []
    test_score = []
    per_part_mean_test = []

    # loop through model layers/hyperparameters
    for ln in layer_range:
        # replace -1 with the layer name
        model_name_ln = model_name.replace('-1', f'{ln}')

        # Open each results archive once and close it again (NpzFile holds a file handle).
        with np.load(f'{base_path}{dataset}_{model_name_ln}.npz') as layer_results:
            # val_scores: (num_outer_folds x num_units)
            val_scores_units = layer_results['val_scores']
            # out_of_sample_r2: (num_units)
            oos_r2_units = layer_results['out_of_sample_r2']

        # Validation: plain mean across units, one score per outer fold.
        val_scores_across_layers.append(np.mean(val_scores_units, axis=-1))

        # --- TEST (on test set) ---
        # mean within each participant, then across participants
        layer_mean, per_part_mean = _mean_across_participants(oos_r2_units, participant_info)
        test_score.append(layer_mean)
        per_part_mean_test.append(per_part_mean)

    # Pick best layer per fold from validation
    val_scores_across_layers_stacked = np.vstack(val_scores_across_layers)  # (num_layers x num_folds)
    # NOTE: np.argmax treats NaN as the maximum, so a NaN validation mean would be selected.
    best_val_layers = np.argmax(val_scores_across_layers_stacked, axis=0)   # (num_folds,)

    # Several folds often select the same layer: load y_hat and compute the
    # squared error once per selected layer instead of once per fold.
    mse_by_layer_idx = {}
    stacked_mse = []

    for idx, bvl_idx in enumerate(best_val_layers):
        bvl_idx = int(bvl_idx)
        if bvl_idx not in mse_by_layer_idx:
            # compute mse for the best layer selected from that outer fold
            bvl = layer_range[bvl_idx]
            model_name_bvl = model_name_y_hat.replace('-1', f'{bvl}')
            with np.load(f'{base_path}{dataset}_{model_name_bvl}.npz') as pred_results:
                y_hat = pred_results['y_hat']
            mse_by_layer_idx[bvl_idx] = (y_true - y_hat) ** 2

        # only take the mse values for the outer fold range
        start_idx = test_fold_idx[idx]
        end_idx = test_fold_idx[idx + 1]
        stacked_mse.append(mse_by_layer_idx[bvl_idx][start_idx:end_idx])

    # np.vstack(stacked_mse): (time x brain units); mean across time -> (brain units,)
    stacked_mse_np = np.mean(np.vstack(stacked_mse), axis=0)
    # Entries where mse_intercept is 0 become NaN/inf here (NumPy warns). NaNs are
    # ignored by np.nanmean in _mean_across_participants; infinite values are not.
    out_of_sample_r2_by_val = 1 - stacked_mse_np / mse_intercept  # (brain units,)

    # --- PRINTS with per-participant aggregation ---
    val_overall_mean, per_part_means_val = _mean_across_participants(out_of_sample_r2_by_val, participant_info)
    print("Performance when using val to select best layer/hparam (per-participant mean, then across participants):",
          val_overall_mean)

    print("Performance when using test set (best layer by test R², per-participant mean, then across participants):",
          np.nanmax(test_score))

    if return_participant_means:
        best_layer = np.nanargmax(test_score)
        return test_score, val_scores_across_layers_stacked, per_part_mean_test[best_layer], per_part_means_val

    return test_score, val_scores_across_layers_stacked


In [19]:
# Inputs for the shuffled Pereira `384` runs: ground-truth test responses,
# per-fold test-set sizes, the baseline MSE stored as `mse_intercept`, and the
# array passed below as `participant_info` (participant id per brain unit).
y_true_shuffled = np.load('/data/LLMs/brainscore/results_pereira/shuffled/y_test_ordered_384_lang.npy')
test_fold_size_shuffled = np.load('/data/LLMs/brainscore/results_pereira/shuffled/test_fold_size_384.npy')
# Cumulative fold boundaries with a leading 0: fold i spans rows [idx[i], idx[i + 1]).
test_fold_idx_shuffled = np.hstack(([0], np.cumsum(test_fold_size_shuffled)))
# Mean over axis 0 of the stored baseline MSE (presumably one row per fold — TODO confirm).
mse_intercept_shuffled = np.mean(np.load('/data/LLMs/brainscore/results_pereira/shuffled/mse_intercept_384_lang.npy'),axis=0)
subjects_384 = np.load('/data/LLMs/data_processed/pereira/dataset/subjects_384_lang.npy', allow_pickle=True)


print("GPT2-XL, SHUFFLED, PEREIRA 384")
# Sweep layer indices 0-48; pt_gpt / pv_gpt hold the per-participant means
# (test-selected / validation-selected) that the merge cell reads later.
ts_gpt, vs_gpt, pt_gpt, pv_gpt = compare_perf_when_using_val('pereira', y_true_shuffled, 
                            test_fold_idx_shuffled, 
                            np.arange(49), 
                            mse_intercept_shuffled,
                            'gpt2-xl_layer_-1_1_384_m0', 'gpt2-xl_layer_-1_1_384_m0', subjects_384, shuffled=True, return_participant_means=True)

print("OASM, SHUFFLED, PEREIRA 384")
# Sweep sigma from 0.1 in steps of 0.1 (stop 4.8 exclusive). The values are
# formatted into the file name, so they are rounded to 3 decimals first
# (presumably to match the saved file names — TODO confirm).
ts, vs, pt_oasm, pv_oasm = compare_perf_when_using_val('pereira', y_true_shuffled, 
                            test_fold_idx_shuffled, 
                            np.round(np.arange(0.1,4.8,0.1),3), 
                            mse_intercept_shuffled,
                            'OASM-all-sigma_-1_1_384', 'OASM-all-sigma_-1_1_384', subjects_384, shuffled=True, return_participant_means=True)

GPT2-XL, SHUFFLED, PEREIRA 384


  out_of_sample_r2_by_val = 1 - stacked_mse_np / mse_intercept  # (brain units,)


Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.1715490221977234
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.17154901736921307
OASM, SHUFFLED, PEREIRA 384
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.22679586542977226
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.22747532839740478


In [20]:
# Inputs for the shuffled Pereira `243` runs.
# NOTE: these assignments overwrite the `384` arrays of the same names, so the
# 384 cell must be re-run before any later use of the 384 inputs.
y_true_shuffled = np.load('/data/LLMs/brainscore/results_pereira/shuffled/y_test_ordered_243_lang.npy')
test_fold_size_shuffled = np.load('/data/LLMs/brainscore/results_pereira/shuffled/test_fold_size_243.npy')
# Cumulative fold boundaries with a leading 0: fold i spans rows [idx[i], idx[i + 1]).
test_fold_idx_shuffled = np.hstack(([0], np.cumsum(test_fold_size_shuffled)))
# Mean over axis 0 of the stored baseline MSE (presumably one row per fold — TODO confirm).
mse_intercept_shuffled = np.mean(np.load('/data/LLMs/brainscore/results_pereira/shuffled/mse_intercept_243_lang.npy'),axis=0)
subjects_243 = np.load('/data/LLMs/data_processed/pereira/dataset/subjects_243_lang.npy', allow_pickle=True)

print("GPT2-XL, SHUFFLED, PEREIRA 243")
# The per-participant means get a `_243` suffix so they can be merged with the 384 ones.
ts_gpt, vs_gpt, pt_gpt_243, pv_gpt_243 = compare_perf_when_using_val('pereira', y_true_shuffled, 
                            test_fold_idx_shuffled, 
                            np.arange(49), 
                            mse_intercept_shuffled,
                            'gpt2-xl_layer_-1_1_243_m2', 'gpt2-xl_layer_-1_1_243_m2', subjects_243, shuffled=True, return_participant_means=True)

print("OASM, SHUFFLED, PEREIRA 243")
ts, vs, pt_oasm_243, pv_oasm_243 = compare_perf_when_using_val('pereira', y_true_shuffled, 
                            test_fold_idx_shuffled, 
                            np.round(np.arange(0.1,4.8,0.1),3), 
                            mse_intercept_shuffled,
                            'OASM-all-sigma_-1_1_243', 'OASM-all-sigma_-1_1_243', subjects_243, shuffled=True, return_participant_means=True)

GPT2-XL, SHUFFLED, PEREIRA 243
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.12202912320693333
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.12202912389656732
OASM, SHUFFLED, PEREIRA 243
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.18252869447072348
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.18359714245434736


In [23]:
def merge_dicts_mean(d1, d2, message):
    """
    Merge two dicts by taking the mean of values for shared keys.
    If a key is unique to one dict, keep its value.

    Also prints ``"{message}: <mean of the merged values>"``.

    Args:
        d1 (dict): First dictionary with numeric values
        d2 (dict): Second dictionary with numeric values
        message (str): Label printed in front of the mean of the merged values

    Returns:
        dict: Merged dictionary, keyed in first-seen order (d1's keys, then
        the keys found only in d2). Because a value is returned, a call left as
        the last expression of a notebook cell is echoed; end it with ``;`` to
        suppress that.
    """
    merged = {}
    # dict.fromkeys de-duplicates while keeping insertion order, so the merge
    # (and the floating-point summation order of the printed mean) does not
    # depend on set/hash ordering.
    all_keys = dict.fromkeys([*d1, *d2])

    for k in all_keys:
        if k in d1 and k in d2:
            merged[k] = (d1[k] + d2[k]) / 2
        elif k in d1:
            merged[k] = d1[k]
        else:
            merged[k] = d2[k]

    # NOTE: np.mean propagates NaN, so one NaN value makes the printed mean NaN.
    print(f"{message}: {np.mean(list(merged.values()))}")
    return merged

# Combine each participant's 384 and 243 means and print the across-participant
# average for every (model, selection scheme) pair, validation rows first.
for merge_args in (
    (pv_oasm, pv_oasm_243, "OASM validation"),
    (pv_gpt, pv_gpt_243, "GPT2 Validation Shuffled"),
    (pt_oasm, pt_oasm_243, "OASM test"),
    (pt_gpt, pt_gpt_243, "GPT2 test Shuffled"),
):
    merge_dicts_mean(*merge_args)



OASM validation: 0.2146851658821106
GPT2 Validation Shuffled: 0.15399813018739222
OASM test: 0.21566910575181578
GPT2 test Shuffled: 0.15399812704523427


In [24]:
# Inputs for the non-shuffled Pereira `384` runs.
# NOTE: `subjects_384` is not loaded here; it comes from the shuffled 384 cell.
y_true = np.load('/data/LLMs/brainscore/results_pereira/y_test_ordered_384_lang.npy')
test_fold_size = np.load('/data/LLMs/brainscore/results_pereira/test_fold_size_384.npy')
# Cumulative fold boundaries with a leading 0: fold i spans rows [idx[i], idx[i + 1]).
test_fold_idx = np.hstack(([0], np.cumsum(test_fold_size)))
# Mean over axis 0 of the stored baseline MSE (presumably one row per fold — TODO confirm).
mse_intercept = np.mean(np.load('/data/LLMs/brainscore/results_pereira/mse_intercept_384_lang.npy'),axis=0)


print("GPT2-XL,PEREIRA 384")
# NOTE: this overwrites pt_gpt / pv_gpt produced by the shuffled 384 cell, so the
# shuffled merge cell must have been run before this one.
ts_gpt, vs_gpt, pt_gpt, pv_gpt = compare_perf_when_using_val('pereira', y_true, 
                            test_fold_idx, 
                            np.arange(49), 
                            mse_intercept,
                            'gpt2-xl_layer_-1_1_384', 'gpt2-xl_layer_-1_1_384', subjects_384, shuffled=False, return_participant_means=True)

print("Position, PEREIRA 384")
ts, vs, pt_pos, pv_pos = compare_perf_when_using_val('pereira', y_true, 
                            test_fold_idx, 
                            np.round(np.arange(0.1,4.8,0.1),3), 
                            mse_intercept,
                            'position_layer_-1_1_384', 'position_layer_-1_1_384', subjects_384, shuffled=False, return_participant_means=True)

GPT2-XL,PEREIRA 384


  out_of_sample_r2_by_val = 1 - stacked_mse_np / mse_intercept  # (brain units,)


Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.027326632084117994
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.029126123267106334
Position, PEREIRA 384
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.011171347633030059
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.011746037433222042


In [26]:
# Inputs for the non-shuffled Pereira `243` runs (overwrites the 384 arrays of
# the same names). `subjects_243` comes from the shuffled 243 cell.
y_true= np.load('/data/LLMs/brainscore/results_pereira/y_test_ordered_243_lang.npy')
test_fold_size = np.load('/data/LLMs/brainscore/results_pereira/test_fold_size_243.npy')
# Cumulative fold boundaries with a leading 0: fold i spans rows [idx[i], idx[i + 1]).
test_fold_idx = np.hstack(([0], np.cumsum(test_fold_size)))
# Mean over axis 0 of the stored baseline MSE (presumably one row per fold — TODO confirm).
mse_intercept = np.mean(np.load('/data/LLMs/brainscore/results_pereira/mse_intercept_243_lang.npy'),axis=0)


print("GPT2-XL,PEREIRA 243")
# NOTE: this overwrites pt_gpt_243 / pv_gpt_243 produced by the shuffled 243 cell.
ts_gpt, vs_gpt, pt_gpt_243, pv_gpt_243 = compare_perf_when_using_val('pereira', y_true, 
                            test_fold_idx, 
                            np.arange(49), 
                            mse_intercept,
                            'gpt2-xl_layer_-1_1_243', 'gpt2-xl_layer_-1_1_243', subjects_243, shuffled=False, return_participant_means=True)

print("Position, PEREIRA 243")
ts, vs, pt_pos_243, pv_pos_243 = compare_perf_when_using_val('pereira', y_true, 
                            test_fold_idx, 
                            np.round(np.arange(0.1,4.8,0.1),3), 
                            mse_intercept,
                            'position_layer_-1_1_243', 'position_layer_-1_1_243', subjects_243, shuffled=False, return_participant_means=True)

GPT2-XL,PEREIRA 243
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.02026554204834004
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.02285006216048691
Position, PEREIRA 243
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.005398740021822353
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.005938581932922522


In [27]:

# Combine each participant's 384 and 243 means and print the across-participant
# average for every (model, selection scheme) pair, validation rows first.
for merge_args in (
    (pv_pos, pv_pos_243, "POS validation"),
    (pv_gpt, pv_gpt_243, "GPT2 Validation"),
    (pt_pos, pt_pos_243, "POS test"),
    (pt_gpt, pt_gpt_243, "GPT2 test"),
):
    merge_dicts_mean(*merge_args)

POS validation: 0.009187782190565486
GPT2 Validation: 0.021470996527932586
POS test: 0.009743077554318092
GPT2 test: 0.02321203350659032


In [114]:
# Shuffled Fedorenko runs: load ground truth, fold sizes and baseline MSE from
# the `shuffled/` sub-directory of `results_dir`.
results_dir = "results_fedorenko"  # or any other directory you want

y_true_shuffled = np.load(f"/data/LLMs/brainscore/{results_dir}/shuffled/y_test_ordered.npy")
test_fold_size_shuffled = np.load(f"/data/LLMs/brainscore/{results_dir}/shuffled/test_fold_size.npy")
# Cumulative fold boundaries with a leading 0: fold i spans rows [idx[i], idx[i + 1]).
test_fold_idx_shuffled = np.hstack(([0], np.cumsum(test_fold_size_shuffled)))
# Mean over axis 0 of the stored baseline MSE (presumably one row per fold — TODO confirm).
mse_intercept_shuffled = np.mean(
    np.load(f"/data/LLMs/brainscore/{results_dir}/shuffled/mse_intercept.npy"), axis=0
)

subjects_fed = np.load('/data/LLMs/data_processed/fedorenko/dataset/subjects.npy', allow_pickle=True)


print("GPT2-XL, SHUFFLED, FEDORENKO")
ts_gpt, vs_gpt = compare_perf_when_using_val('fedorenko', y_true_shuffled, 
                            test_fold_idx_shuffled, 
                            np.arange(49), 
                            mse_intercept_shuffled,
                            'gpt2-xl_layer_-1_1', 'gpt2-xl_layer_-1_1', subjects_fed, shuffled=True)


print("OASM, SHUFFLED, FEDORENKO")
# NOTE(review): the OASM results are stored in ts_gpt / vs_gpt, replacing the
# GPT2-XL results from the call above — confirm this is intended.
ts_gpt, vs_gpt = compare_perf_when_using_val('fedorenko', y_true_shuffled, 
                            test_fold_idx_shuffled, 
                            np.round(np.arange(0.1,4.8,0.1),3), 
                            mse_intercept_shuffled,
                            'OASM-all-sigma_-1_1', 'OASM-all-sigma_-1_1', subjects_fed, shuffled=True)


GPT2-XL, SHUFFLED, FEDORENKO
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.08362601026892662
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.0854775407576281
OASM, SHUFFLED, FEDORENKO
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.09688139706850052
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.09788830144179941


In [113]:
# Fedorenko runs; `shuffled` selects between the top-level results directory
# and its `shuffled/` sub-directory.
results_dir = "results_fedorenko"  # or any other directory you want
shuffled = False  # set True/False here

# decide subfolder based on shuffled flag
subdir = "shuffled" if shuffled else ""

# add "/" only if needed
folder = f"/{subdir}" if subdir else ""

y_true = np.load(f"/data/LLMs/brainscore/{results_dir}{folder}/y_test_ordered.npy")
test_fold_size = np.load(f"/data/LLMs/brainscore/{results_dir}{folder}/test_fold_size.npy")
# Cumulative fold boundaries with a leading 0: fold i spans rows [idx[i], idx[i + 1]).
test_fold_idx = np.hstack(([0], np.cumsum(test_fold_size)))
# Mean over axis 0 of the stored baseline MSE (presumably one row per fold — TODO confirm).
mse_intercept = np.mean(
    np.load(f"/data/LLMs/brainscore/{results_dir}{folder}/mse_intercept.npy"), axis=0
)

subjects_fed = np.load('/data/LLMs/data_processed/fedorenko/dataset/subjects.npy', allow_pickle=True)

print(f"GPT2-XL, {'SHUFFLED' if shuffled else 'NOT SHUFFLED'}, FEDORENKO")
ts_gpt, vs_gpt = compare_perf_when_using_val(
    'fedorenko',
    y_true,
    test_fold_idx,
    np.arange(49),
    mse_intercept,
    'gpt2-xl_layer_-1_1',
    'gpt2-xl_layer_-1_1',
    subjects_fed,
    shuffled=shuffled
)

print(f"POS, {'SHUFFLED' if shuffled else 'NOT SHUFFLED'}, FEDORENKO")
# NOTE(review): the POS results are stored in ts_gpt / vs_gpt, replacing the
# GPT2-XL results from the call above — confirm this is intended.
ts_gpt, vs_gpt = compare_perf_when_using_val(
    'fedorenko',
    y_true,
    test_fold_idx,
    np.round(np.arange(0.1, 4.8, 0.1), 3),
    mse_intercept,
    'pos_layer_-1_1',
    'pos_layer_-1_1',
    subjects_fed,
    shuffled=shuffled
)


GPT2-XL, NOT SHUFFLED, FEDORENKO
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.048867734521627425
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.05085057621718514
POS, NOT SHUFFLED, FEDORENKO
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.04647891782224178
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.047141644278820984


In [111]:
# Shuffled Blank runs: load ground truth, fold sizes and baseline MSE from the
# `shuffled/` sub-directory of `results_dir`.
results_dir = "results_blank"  # or any other directory you want

y_true_shuffled = np.load(f"/data/LLMs/brainscore/{results_dir}/shuffled/y_test_ordered.npy")
test_fold_size_shuffled = np.load(f"/data/LLMs/brainscore/{results_dir}/shuffled/test_fold_size.npy")
# Cumulative fold boundaries with a leading 0: fold i spans rows [idx[i], idx[i + 1]).
test_fold_idx_shuffled = np.hstack(([0], np.cumsum(test_fold_size_shuffled)))
# Mean over axis 0 of the stored baseline MSE (presumably one row per fold — TODO confirm).
mse_intercept_shuffled = np.mean(
    np.load(f"/data/LLMs/brainscore/{results_dir}/shuffled/mse_intercept.npy"), axis=0
)
subjects_blank = np.load('/data/LLMs/data_processed/blank/dataset/subjects.npy', allow_pickle=True)



print("GPT2-XL, SHUFFLED, BLANK")
# NOTE(review): this sweeps np.arange(48) (layers 0-47) while every other GPT2-XL
# sweep in this notebook uses np.arange(49) — confirm whether layer 48 is
# intentionally excluded here.
_, _ = compare_perf_when_using_val('blank', y_true_shuffled, 
                            test_fold_idx_shuffled, 
                            np.arange(48), 
                            mse_intercept_shuffled,
                            'gpt2-xl_layer_-1_1', 'gpt2-xl_layer_-1_1', subjects_blank, shuffled=True)




print("OASM, SHUFFLED, BLANK")
_, _ = compare_perf_when_using_val('blank', y_true_shuffled, 
                            test_fold_idx_shuffled, 
                            np.round(np.arange(0.1,4.8,0.1),3), 
                            mse_intercept_shuffled,
                            'OASM-all-sigma_-1_1', 'OASM-all-sigma_-1_1', subjects_blank, shuffled=True)

GPT2-XL, SHUFFLED, BLANK
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.0017078061355277896
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.002857624729844399
OASM, SHUFFLED, BLANK
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.2747193992137909
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.27495304508994145


In [109]:
# Blank runs; `shuffled` selects between the top-level results directory and
# its `shuffled/` sub-directory.
results_dir = "results_blank"  # or any other directory you want
shuffled = False  # set True/False here

subjects_blank = np.load('/data/LLMs/data_processed/blank/dataset/subjects.npy', allow_pickle=True)

# decide subfolder based on shuffled flag
subdir = "shuffled" if shuffled else ""

# add "/" only if needed
folder = f"/{subdir}" if subdir else ""

y_true = np.load(f"/data/LLMs/brainscore/{results_dir}{folder}/y_test_ordered.npy")
test_fold_size = np.load(f"/data/LLMs/brainscore/{results_dir}{folder}/test_fold_size.npy")
# Cumulative fold boundaries with a leading 0: fold i spans rows [idx[i], idx[i + 1]).
test_fold_idx = np.hstack(([0], np.cumsum(test_fold_size)))
# Mean over axis 0 of the stored baseline MSE (presumably one row per fold — TODO confirm).
mse_intercept = np.mean(
    np.load(f"/data/LLMs/brainscore/{results_dir}{folder}/mse_intercept.npy"), axis=0
)

print(f"GPT2-XL, {'SHUFFLED' if shuffled else 'NOT SHUFFLED'}, BLANK")
ts_gpt, vs_gpt = compare_perf_when_using_val(
    'blank',
    y_true,
    test_fold_idx,
    np.arange(49),
    mse_intercept,
    'gpt2-xl_layer_-1_1',
    'gpt2-xl_layer_-1_1',
    subjects_blank,
    shuffled=shuffled
)

print(f"POS, {'SHUFFLED' if shuffled else 'NOT SHUFFLED'}, BLANK")
# The sweep here is the integers 3..50, unlike the 0.1-step sweeps used for the
# other datasets; np.round(..., 3) leaves an integer range unchanged.
# NOTE(review): the POS results are stored in ts_gpt / vs_gpt, replacing the
# GPT2-XL results from the call above — confirm this is intended.
ts_gpt, vs_gpt = compare_perf_when_using_val(
    'blank',
    y_true,
    test_fold_idx,
    np.round(np.arange(3,51), 3),
    mse_intercept,
    'pos_layer_-1_1',
    'pos_layer_-1_1',
    subjects_blank,
    shuffled=shuffled
)


GPT2-XL, NOT SHUFFLED, BLANK
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.00013014674186706543
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.00036519853364957257
POS, NOT SHUFFLED, BLANK
Performance when using val to select best layer/hparam (per-participant mean, then across participants): 0.006913880538195372
Performance when using test set (best layer by test R², per-participant mean, then across participants): 0.008489313158332183
