# Replicating Burnell Linear Factors

Approximately follows the methodology of [Burnell et al.](https://arxiv.org/abs/2306.10062).

**NOT** an exact reproduction.

One key point of difference is the use of cross-validation to select the number of factors.

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotly.graph_objects as go

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Local files
from lsoc.factor import factor, selection, vis, data

In [None]:
# Fetch helm_data.csv from the revealing-LLM-capabilities repository and index
# it by the 'Model' column.
url = "https://raw.githubusercontent.com/RyanBurnell/revealing-LLM-capabilities/refs/heads/main/helm_data.csv"
df = pd.read_csv(url).set_index("Model")

# The first four rows are set aside as `meta`; every remaining row is cast to
# float and used as the score matrix.
meta = df.iloc[:4]
df = df.iloc[4:].astype(float).copy()

# Drop missing values: first keep only rows with at most 5 missing entries ...
missing_per_row = df.isna().sum(axis=1)
df = df.loc[missing_per_row <= 5].copy()

# ... then, counting on the rows that survived, remove every column that still
# has more than 5 missing entries.
missing_per_col = df.isna().sum()
cols_to_drop = df.columns[missing_per_col > 5]
df = df.drop(columns=cols_to_drop)

# NOTE(review): this list is defined but not applied anywhere in the visible
# cells -- confirm whether these tasks are meant to be dropped explicitly or
# are already removed by the missing-value filter above.
disqualified_tasks = [
    "MS_MARCO_(regular)_RR@10",
    "MS_MARCO_(TREC)_NDCG@10",
    "NaturalQuestions_(open-book)_F1",
    "MATH_(chain-of-thoughts)_Equivalent_(chain_of_thought)",
    "Data_imputation_EM",
    "Entity_matching_EM",
]

# Impute the remaining gaps; the fixed random_state keeps the result repeatable.
imp = IterativeImputer(random_state=0)
imputed = pd.DataFrame(
    data=imp.fit_transform(df),
    index=df.index,
    columns=df.columns,
)
imputed.head()


In [None]:
# Holdout model selection
from sklearn.preprocessing import StandardScaler

# Standardised copy of the imputed scores (a numpy array, not a DataFrame).
# NOTE(review): X_scaled is not what gets cross-validated below -- the
# unscaled `imputed` frame is passed instead. Confirm whether
# selection.cross_validate standardises its input internally.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(imputed)

# Model whose number of factors is being selected.
model = factor.PCA()  # alternative noted by the author: FA()
errs = selection.cross_validate(
    imputed,
    model,
    max_factors=7,
    n_folds=20,
    repeats=1,
)
fig = vis.crossval(*errs, method_name=model.name)
fig.show()

# Recorded outcomes: "4 or 5 dimensions" for this plot; factor analysis says
# 3 factors, which is consistent with the Burnell paper.

In [None]:
# TODO: get offset working
from factor_analyzer import FactorAnalyzer

n_components = 3

# Fit the final factor model, with an oblimin rotation, on the standardised
# scores.
fa_final = FactorAnalyzer(rotation='oblimin', n_factors=n_components)
fa_final.fit(X_scaled)

# H is the transposed loading matrix (one row per factor), standing in for the
# `components_` of the earlier, now disabled, fit_transform-based approach.
H = fa_final.loadings_.T

component_names = [f"PC{k}" for k in range(1, n_components + 1)]

# One row per task (the columns of `imputed`), one column per factor.
loading_df = pd.DataFrame(
    fa_final.loadings_,
    index=imputed.columns,
    columns=component_names,
)

# Order the tasks for display: group them by the factor they load on most
# strongly (by absolute value) and, within each group, put the strongest
# loading first. The factor index is scaled by 100 so that it dominates the
# subtracted loading magnitude in the combined sort key.
ld = np.abs(loading_df.values)
main_load = ld.argmax(axis=1)
strength = ld.max(axis=1)
order = np.argsort(main_load * 100 - strength)
loading_df = loading_df.iloc[order]

# Optional sign flip (currently disabled): negate rows whose largest-magnitude
# loading is negative.
# flip = loading_df.max(axis=1) != loading_df.abs().max(axis=1)
# loading_df[flip] *= -1

fig = vis.heatmap(
    loading_df,
    title="Task Loadings",
    width=6,
    height=12,
    reversescale=True,
)
fig.show()

Copyright (c) Gradient Institute and Timaeus. All rights reserved.

Licensed under the Apache 2.0 License.