# Pythia's Helm-lite Evals against Burnell's Dataset

Here we analyse the custom HELM-lite evals we collected against the factors we found in the Burnell data.

The evals are preliminary because:
* low coverage of the evals we wanted
* high levels of noise due to reduced (n=100) sampling

Preliminary analysis suggests this data supports 2 latent factors with a similar interpretation to the earlier analysis.


In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Local files
from lsoc.factor import factor, selection, vis, data

In [None]:
# Fetch the HELM results table published alongside Burnell et al.'s analysis
# and index it by model name.
url = "https://raw.githubusercontent.com/RyanBurnell/revealing-LLM-capabilities/refs/heads/main/helm_data.csv"
df = pd.read_csv(url).set_index('Model')

# The first four rows are set aside as `meta` (presumably per-task metadata
# rather than model scores -- confirm against the CSV); everything below them
# is treated as the numeric model-by-task score matrix.
meta = df.iloc[:4]
df = df.iloc[4:].astype(float)
df.head()

In [None]:
# Clean the Burnell score matrix and fill the remaining gaps.
#
# Steps:
#   1. drop models (rows) with more than 5 missing task scores;
#   2. drop tasks (columns) with more than 5 missing scores among the
#      remaining models;
#   3. drop the hand-picked `disqualified_tasks`;
#   4. impute whatever is still missing with sklearn's IterativeImputer.
#
# Original note: "There is one missing value in the whole matrix".
# NOTE(review): that note predates (or refers to the state after) the
# filtering below -- confirm which.
missing_per_row = df.isna().sum(axis=1)
df = df[missing_per_row <= 5].copy()
missing_per_col = df.isna().sum()
cols_to_drop = df.columns[missing_per_col > 5]
df.drop(columns=cols_to_drop, inplace=True)
disqualified_tasks = [
    "MS_MARCO_(regular)_RR@10",
    "MS_MARCO_(TREC)_NDCG@10",
    #"NaturalQuestions_(open-book)_F1",
    "MATH_(chain-of-thoughts)_Equivalent_(chain_of_thought)",
    "Data_imputation_EM",
    "Entity_matching_EM"
]
# errors="ignore" keeps this cell safe to re-run and safe when some of the
# disqualified tasks were already removed by the missing-value filter above.
# (Previously the drop was guarded on a single column name, which raised
# KeyError if any *other* listed column was absent and skipped the drop
# entirely if that one column was absent.)
df = df.drop(columns=disqualified_tasks, errors="ignore")

# impute missing data
# The experimental flag must be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(random_state=0)
imputed = pd.DataFrame(imp.fit_transform(df), columns=df.columns, index=df.index)

assert imputed.isna().sum().sum() == 0


In [None]:
# Load our 'helm-reduced' Pythia checkpoint evals and restrict both tables to
# the tasks (columns) they have in common with Burnell's selection.
df_pythia = pd.read_csv(data.default_path + "/evals/pythia_steps.csv").set_index('Model')

# Sorted so that both frames end up with the same, deterministic column order.
common = sorted(set(df.columns).intersection(df_pythia.columns))

df_pythia = df_pythia[common]
imputed = imputed[common]  # the one we actually use
df_pythia

In [None]:
# Holdout model selection: standardise the Burnell scores and use
# cross-validation to choose how many latent factors to keep.
from sklearn.preprocessing import StandardScaler
# `scaler` keeps the per-task mean/std of the Burnell data; `X_scaled` is what
# the final factor model is fitted on in the next cell.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(imputed)  # is a numpy array

# Candidate model to cross-validate (project wrapper).
# NOTE(review): cross_validate is given the unscaled `imputed`, not `X_scaled`
# -- confirm that selection.cross_validate / the model standardise internally.
model = factor.PCA()  # alternative tried: FA()
errs = selection.cross_validate(imputed, model, max_factors=7, n_folds=20, repeats=1)
fig = vis.crossval(*errs, method_name=model.name)
fig.show()  # answer - 4 or 5 dimensions

# Yep... factor analysis says 3 factors - consistent with burnell paper
# NOTE(review): the two result notes above disagree (4-5 vs 3), and the next
# cell fits n_components = 2 -- reconcile once the evals are re-run.

In [None]:
# Fit the final oblimin-rotated factor model on the standardised Burnell
# scores and show the task loadings as a heatmap.
# TODO: get offset working
n_components = 2
from factor_analyzer import FactorAnalyzer
fa_final = FactorAnalyzer(n_factors=n_components, rotation='oblimin')
fa_final.fit(X_scaled)

# Loadings come back as (n_tasks, n_factors); H is the factors-by-tasks view.
task_loadings = fa_final.loadings_
H = task_loadings.T

component_names = [f"PC{k}" for k in range(1, n_components + 1)]

# Order tasks by the factor they load on most strongly (by absolute value),
# and within each factor from the strongest to the weakest loading.
abs_loadings = np.abs(task_loadings)
dominant_factor = abs_loadings.argmax(axis=1)
dominant_strength = abs_loadings.max(axis=1)
row_order = np.argsort(dominant_factor * 100 - dominant_strength)

loading_df = pd.DataFrame(
    data=task_loadings,
    index=imputed.columns,
    columns=component_names,
).iloc[row_order]

# Sign-flipping rows so that each task's dominant loading is positive was
# tried here and is currently disabled.

fig = vis.heatmap(
    loading_df,
    title="Task Loadings",
    width=6,
    height=12,
    reversescale=True
)
fig.show()

In [None]:
# Project every Pythia checkpoint onto the two Burnell factors and plot the
# factor scores against training step, one line per model.
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Index labels are of the form "<model>/<step>".
models = sorted(set(v.split("/")[0] for v in df_pythia.index.values))

# Create a figure with two subplots stacked vertically
fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
                    vertical_spacing=0.1,
                    subplot_titles=('Z1 (Burnell "reasoning")', 'Z2 (Burnell "comprehension")'))

# One pair of traces per model; Plotly assigns its default colours.
# The loop variable is deliberately not called `model`, so the estimator
# bound to that name by the model-selection cell is not overwritten.
for model_name in models:
    # Exact match on the model part of the label: a substring test would also
    # pick up checkpoints of any model whose name merely contains this one.
    labels = [v for v in df_pythia.index.values if v.split("/")[0] == model_name]
    steps = np.array([int(v.split("/")[1]) for v in labels])
    checkpoints = df_pythia.loc[labels]

    # fa_final was fitted on StandardScaler output, so the Pythia rows must go
    # through the *same* fitted scaler first; feeding raw scores straight into
    # fa_final.transform leaves them on a different offset/scale from the
    # data the factors were estimated on.
    scores = fa_final.transform(scaler.transform(checkpoints))  # n x n_components

    # Sort by step to ensure lines connect points in the right order
    by_step = np.argsort(steps)
    step_sorted = steps[by_step]
    scores_sorted = scores[by_step]

    # First factor on the top subplot (legend entry), second on the bottom
    # subplot (legend entry suppressed so each model is listed once).
    for factor_idx, subplot_row, in_legend in ((0, 1, True), (1, 2, False)):
        fig.add_trace(
            go.Scatter(
                x=step_sorted,
                y=scores_sorted[:, factor_idx],
                mode='lines+markers',
                name=model_name,
                legendgroup=model_name,
                showlegend=in_legend
            ),
            row=subplot_row, col=1
        )

# Update layout
fig.update_layout(
    title_text="Model Scores vs. Training Steps",
    height=800,
    width=1000,
)

# Set log scale for x-axis on both subplots
# NOTE(review): a log axis cannot display step 0 -- confirm whether any
# checkpoint is at step 0 and needs special handling.
fig.update_xaxes(type="log", title_text="Steps", row=2, col=1)
fig.update_xaxes(type="log", row=1, col=1)

# Add y-axis titles
fig.update_yaxes(title_text="Z1 (Burnell's 'reasoning')", row=1, col=1)
fig.update_yaxes(title_text="Z2 (Burnell's 'comprehension')", row=2, col=1)

# Add grid for better readability
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

fig.show()


Copyright (c) Gradient Institute and Timaeus. All rights reserved.

Licensed under the Apache 2.0 License.
