In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import torch
from tqdm.auto import tqdm
from sklearn.linear_model import RidgeCV
from utils import *

device='cpu'

def logit(x, eps=.001):
    c = 0 #np.min(x)
    x2 = (np.array(x)-c)/(1-c)
    return np.log((x2+eps)/(1-x2+eps))

data_path = '/llmthonskdir/felipe/download_openllmlb/'

mmlu_subs = ['hendrycksTest-abstract_algebra', 'hendrycksTest-anatomy', 'hendrycksTest-astronomy', 'hendrycksTest-business_ethics',
             'hendrycksTest-clinical_knowledge', 'hendrycksTest-college_biology', 'hendrycksTest-college_chemistry',
             'hendrycksTest-college_computer_science', 'hendrycksTest-college_mathematics', 'hendrycksTest-college_medicine', 'hendrycksTest-college_physics', 'hendrycksTest-computer_security', 'hendrycksTest-conceptual_physics', 'hendrycksTest-econometrics', 'hendrycksTest-electrical_engineering', 'hendrycksTest-elementary_mathematics', 'hendrycksTest-formal_logic', 'hendrycksTest-global_facts', 'hendrycksTest-high_school_biology', 'hendrycksTest-high_school_chemistry', 'hendrycksTest-high_school_computer_science', 'hendrycksTest-high_school_european_history', 'hendrycksTest-high_school_geography', 'hendrycksTest-high_school_government_and_politics', 'hendrycksTest-high_school_macroeconomics', 'hendrycksTest-high_school_mathematics', 'hendrycksTest-high_school_microeconomics', 'hendrycksTest-high_school_physics', 'hendrycksTest-high_school_psychology', 'hendrycksTest-high_school_statistics', 'hendrycksTest-high_school_us_history', 'hendrycksTest-high_school_world_history', 'hendrycksTest-human_aging', 'hendrycksTest-human_sexuality', 'hendrycksTest-international_law', 'hendrycksTest-jurisprudence', 'hendrycksTest-logical_fallacies', 'hendrycksTest-machine_learning', 'hendrycksTest-management', 'hendrycksTest-marketing', 'hendrycksTest-medical_genetics', 'hendrycksTest-miscellaneous', 'hendrycksTest-moral_disputes', 'hendrycksTest-moral_scenarios', 'hendrycksTest-nutrition', 'hendrycksTest-philosophy', 'hendrycksTest-prehistory', 'hendrycksTest-professional_accounting', 'hendrycksTest-professional_law', 'hendrycksTest-professional_medicine', 'hendrycksTest-professional_psychology', 'hendrycksTest-public_relations', 'hendrycksTest-security_studies', 'hendrycksTest-sociology', 'hendrycksTest-us_foreign_policy', 'hendrycksTest-virology', 'hendrycksTest-world_religions']

In [2]:
data = pd.read_csv('data/base_llm_benchmark_eval.csv')
models_scaling = list(data.Model)
models_scaling = [filter(m) for m in models_scaling]

with open(data_path+'scaling_laws/old_leaderboard_processed_20240630.pickle', 'rb') as handle:
    lb_data = pickle.load(handle)
models_lb = lb_data['hendrycksTest-abstract_algebra']['models']
models_lb = [filter(m) for m in models_lb]

accs_mmlu = [lb_data[s]['correctness'].mean(1) for s in mmlu_subs]
accs_mmlu = np.vstack(accs_mmlu).T

In [3]:
with open(data_path+'/old_leaderboard_processed_20240630.pickle', 'rb') as handle:
    full_lb_data = pickle.load(handle)
asymptot = {}
for s in full_lb_data.keys():
    asymptot[s] = np.percentile(full_lb_data[s]['correctness'].mean(-1), 1)

In [4]:
for j,s in enumerate(mmlu_subs):
    posics = []
    for m in models_scaling:
        if m in models_lb:
            posics.append(np.array(np.array(models_lb)==m).argmax())
        else:
            posics.append(-1)
    
    y = []
    for posic in posics:
        if posic==-1:
            y.append(np.nan)
        else:
            y.append(accs_mmlu[posic,j])
            
    data[s] = y

In [5]:
fam_encoder = LabelEncoder()
fam_encoder.fit(data['Model Family'])
data['T'] = data['Pretraining Data Size (T)']
data['S'] = data['Model Size (B)']
data['F'] = data['FLOPs (1E21)']
data['family'] = data['Model Family']
data = data.sort_values(by=['family','S']).reset_index(drop=True)
data['logT'] = np.log(data['T'])
data['logS'] = np.log(data['S'])
data['logF'] = np.log(data['F'])
data['logS*logT'] = data['logS']*data['logT']
data = data[['family','logT','logS','logF','logS*logT','ARC-C','HellaSwag','Winograd','TruthfulQA','GSM8K']+mmlu_subs] #,'XWinograd','HumanEval'

In [9]:
data = data.dropna(how='any')
unique_families, counts_families = np.unique(data.family, return_counts=True)
avail_families = unique_families[counts_families>=2]

In [11]:
test_family = avail_families[0]

In [129]:
data_train = data.loc[data.family != test_family]
data_test = data.loc[data.family == test_family]
data_train = pd.concat((data_test.iloc[:1],data_train), axis=0).reset_index(drop=True)
data_test = data_test.iloc[1:].reset_index(drop=True)

###
Y_train = torch.tensor(np.array(data_train.loc[:,['ARC-C','HellaSwag','Winograd','TruthfulQA','GSM8K']+mmlu_subs]))
X_train = np.array(data_train.loc[:,['logT','logS','logS*logT']])
X_train = torch.tensor(X_train).double()
Y_test = torch.tensor(np.array(data_test.loc[:,['ARC-C','HellaSwag','Winograd','TruthfulQA','GSM8K']+mmlu_subs]))
X_test = np.array(data_test.loc[:,['logT','logS','logS*logT']])
X_test = torch.tensor(X_test).double()
D_train = torch.tensor(np.array(pd.get_dummies(np.array(data_train.family)))).double()
D_test = torch.tensor(np.vstack([D_train[0,:].numpy() for _ in range(Y_test.shape[0])])).double()

In [130]:
X_train.shape

torch.Size([83, 3])

In [152]:
def k(x, h):
    return torch.exp(-(x/h)**2)

In [153]:
Y_train.shape

torch.Size([83, 62])

In [156]:

h.shape

torch.Size([1, 62, 1])

In [159]:
mask = torch.ones(K.shape, dtype=bool)
for i in range(mask.shape[0]):
    mask[i,:,i] = False

In [172]:
thetas = torch.nn.Parameter(torch.normal(0, .1, size=(D_train.shape[1],1), dtype=torch.float64, device=device))
beta = torch.nn.Parameter(torch.normal(0, .1, size=(X_train.shape[1],1), dtype=torch.float64, device=device))
h=.1*torch.nn.Parameter(torch.ones((1,Y_train.shape[1],1), dtype=torch.float64, device=device))

#training loop here

Z = (X_train@beta)+(D_train@thetas)
Z = Z@torch.ones((1,Y_train.shape[1])).double()
K = k(Z[:,:,None]-Z.T[None,:,:], h)

loss = 0
for i in range(mask.shape[0]):
    Y_hat = ((((K*Y_train.T[None,:,:])[i])*mask[i]).sum(1)/((K*Y_train.T[None,:,:])[i]).sum(1))
    loss += (Y_train[i]-Y_hat)**2

In [179]:
import torch
import torch.optim as optim

# Assume these variables are already defined
# thetas, beta, h, X_train, D_train, Y_train, mask, device

thetas = torch.nn.Parameter(torch.normal(0, .1, size=(D_train.shape[1],1), dtype=torch.float64, device=device))
beta = torch.nn.Parameter(torch.normal(0, .1, size=(X_train.shape[1],1), dtype=torch.float64, device=device))
h=torch.nn.Parameter(torch.ones((1,Y_train.shape[1],1), dtype=torch.float64, device=device))

# Initialize the Adam optimizer
optimizer = optim.Adam([thetas, beta, h], lr=.1)

# Number of epochs
num_epochs = 100

# List to track the loss at each epoch
losses = []

for epoch in range(num_epochs):
    optimizer.zero_grad()  # Zero the gradients before backward pass
    
    # Forward pass
    Z = (X_train @ beta) + (D_train @ thetas)
    Z = Z @ torch.ones((1, Y_train.shape[1])).double()
    K = k(Z[:, :, None] - Z.T[None, :, :], h)
    
    # Compute the loss
    loss = 0
    for i in range(mask.shape[0]):
        Y_hat = ((((K * Y_train.T[None, :, :])[i]) * mask[i]).sum(1) / 
                 ((K * Y_train.T[None, :, :])[i]).sum(1))
        loss += (Y_train[i] - Y_hat).pow(2).sum()
    
    # Backward pass and optimization step
    loss.backward()
    optimizer.step()
    
    # Track the loss
    losses.append(loss.item())
    
    # Print the loss for the current epoch
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

# losses now contain the loss value at each epoch


Epoch 1/100, Loss: 1961.9933985484663
Epoch 2/100, Loss: 1918.174402303216
Epoch 3/100, Loss: 1852.5807646740354
Epoch 4/100, Loss: 1763.647459113954
Epoch 5/100, Loss: 1653.818546022617
Epoch 6/100, Loss: 1523.1037441670803
Epoch 7/100, Loss: 1376.7680140414548
Epoch 8/100, Loss: 1216.2174979198976
Epoch 9/100, Loss: 998.7162701570958
Epoch 10/100, Loss: 710.7936103057709
Epoch 11/100, Loss: 481.2625826362743
Epoch 12/100, Loss: 776.7947197484353
Epoch 13/100, Loss: 671.6676085265377
Epoch 14/100, Loss: 560.678172986824
Epoch 15/100, Loss: 595.9643275539843
Epoch 16/100, Loss: 638.2424866538337
Epoch 17/100, Loss: 662.0370614323286
Epoch 18/100, Loss: 666.7912564006713
Epoch 19/100, Loss: 661.6483355428
Epoch 20/100, Loss: 656.6941756157765
Epoch 21/100, Loss: 660.681111130454
Epoch 22/100, Loss: 659.640559832386
Epoch 23/100, Loss: 649.0773018193964
Epoch 24/100, Loss: 628.3424793030186
Epoch 25/100, Loss: 605.1577066426185
Epoch 26/100, Loss: 581.8776673964228
Epoch 27/100, Loss: 56

In [184]:
import torch
import torch.optim as optim

# Assume these variables are already defined
# thetas, beta, h, X_train, D_train, Y_train, mask, device

thetas = torch.nn.Parameter(torch.normal(0, .1, size=(D_train.shape[1],1), dtype=torch.float64, device=device))
beta = torch.nn.Parameter(torch.normal(0, .1, size=(X_train.shape[1],1), dtype=torch.float64, device=device))
h = torch.nn.Parameter(torch.ones((1,Y_train.shape[1],1), dtype=torch.float64, device=device))

# Initialize the Adam optimizer
optimizer = optim.Adam([thetas, beta, h], lr=0.1)

# Number of epochs
num_epochs = 8

# Lists to track the loss and MAE at each epoch
losses = []
maes = []

for epoch in range(num_epochs):
    optimizer.zero_grad()  # Zero the gradients before backward pass
    
    # Forward pass
    Z = (X_train @ beta) + (D_train @ thetas)
    Z = Z @ torch.ones((1, Y_train.shape[1])).double()
    K = k(Z[:, :, None] - Z.T[None, :, :], h)
    
    # Compute the loss and MAE
    loss = 0
    mae = 0
    for i in range(mask.shape[0]):
        Y_hat = ((((K * Y_train.T[None, :, :])[i]) * mask[i]).sum(1) / 
                 ((K * Y_train.T[None, :, :])[i]).sum(1))
        loss += (Y_train[i] - Y_hat).pow(2).mean()
        mae += torch.abs(Y_train[i] - Y_hat).mean()
    
    # Backward pass and optimization step
    loss.backward()
    optimizer.step()
    
    # Track the loss and MAE
    losses.append(loss.item()/ Y_train.numel())
    maes.append(mae.item() / Y_train.numel())  # Average MAE across all elements
    
    # Print the loss and MAE for the current epoch
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}, MAE: {maes[-1]}')

# losses and maes now contain the loss and MAE values at each epoch


Epoch 1/8, Loss: 31.473319831205455, MAE: 0.00935490070458177
Epoch 2/8, Loss: 30.566449114215132, MAE: 0.009160511832351973
Epoch 3/8, Loss: 29.41860551448117, MAE: 0.008929719264447232
Epoch 4/8, Loss: 28.046466475321424, MAE: 0.008671480869101453
Epoch 5/8, Loss: 26.28469016783835, MAE: 0.008329395175142687
Epoch 6/8, Loss: 24.091191853094287, MAE: 0.007871639494972653
Epoch 7/8, Loss: 21.494921377501303, MAE: 0.0073034614541404525
Epoch 8/8, Loss: 18.683643231975825, MAE: 0.0067661822345306604


In [186]:
Y_hat

tensor([0.6541, 0.6617, 0.6543, 0.7101, 0.7642, 0.6744, 0.6417, 0.6379, 0.6524,
        0.6362, 0.6326, 0.6702, 0.6604, 0.7055, 0.6473, 0.6466, 0.6692, 0.6438,
        0.6502, 0.6528, 0.6461, 0.6810, 0.6134, 0.6404, 0.6498, 0.6607, 0.6397,
        0.6409, 0.6426, 0.6460, 0.7045, 0.6461, 0.6746, 0.6466, 0.6514, 0.6286,
        0.6467, 0.6395, 0.6422, 0.6502, 0.6380, 0.6379, 0.6543, 0.6589, 0.6595,
        0.6338, 0.6445, 0.6496, 0.5891, 0.6401, 0.6447, 0.6423, 0.6475, 0.6369,
        0.6178, 0.6269, 0.6608, 0.6521, 0.6523, 0.6463, 0.6577, 0.6566],
       dtype=torch.float64, grad_fn=<DivBackward0>)

In [188]:
Y_train[i]

tensor([0.6578, 0.8206, 0.8287, 0.4260, 0.3487, 0.4200, 0.7333, 0.8947, 0.7800,
        0.8113, 0.8819, 0.4700, 0.6300, 0.4000, 0.7110, 0.5490, 0.8000, 0.7745,
        0.5614, 0.7724, 0.6455, 0.5159, 0.5700, 0.8968, 0.6502, 0.8200, 0.8485,
        0.9293, 0.9741, 0.7821, 0.3815, 0.8403, 0.4636, 0.9211, 0.6713, 0.9069,
        0.9072, 0.7937, 0.8626, 0.9174, 0.8889, 0.8834, 0.5804, 0.8932, 0.9402,
        0.8800, 0.9029, 0.8035, 0.6760, 0.8464, 0.8199, 0.8364, 0.6135, 0.5913,
        0.8088, 0.8203, 0.7273, 0.8204, 0.8806, 0.9200, 0.5723, 0.8772],
       dtype=torch.float64)

In [43]:
h.shape

torch.Size([2])

In [6]:
def sigmoid(z, use_torch=True):
    if use_torch:
        return torch.nn.Sigmoid()(z)
    else:
        return 1/(1+np.exp(-z))

In [7]:
d = 10

epochs=100
tol=1e-8

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression


In [8]:
data = data.dropna(how='any')
unique_families, counts_families = np.unique(data.family, return_counts=True)
avail_families = unique_families[counts_families>=2]

In [12]:
preds = []
preds_baseline = []
preds_baseline2 = []
preds_rf = []
ys = []

for test_family in tqdm(avail_families):
    
    data_train = data.loc[data.family != test_family]
    data_test = data.loc[data.family == test_family]
    data_train = pd.concat((data_test.iloc[:1],data_train), axis=0).reset_index(drop=True)
    data_test = data_test.iloc[1:].reset_index(drop=True)
    
    ###
    Y_train = torch.tensor(logit(np.array(data_train.loc[:,['ARC-C','HellaSwag','Winograd','TruthfulQA','GSM8K']+mmlu_subs])))
    X_train = np.array(data_train.loc[:,['logT','logS','logS*logT']])
    X_train = torch.tensor(np.hstack((np.ones(X_train.shape[0])[:,None], X_train))).double()
    Y_test = torch.tensor(logit(np.array(data_test.loc[:,['ARC-C','HellaSwag','Winograd','TruthfulQA','GSM8K']+mmlu_subs])))
    X_test = np.array(data_test.loc[:,['logT','logS','logS*logT']])
    X_test = torch.tensor(np.hstack((np.ones(X_test.shape[0])[:,None], X_test))).double()
    D_train = torch.tensor(np.array(pd.get_dummies(np.array(data_train.family)))).double()
    D_test = torch.tensor(np.vstack([D_train[0,:].numpy() for _ in range(Y_test.shape[0])])).double()

    X2_train = torch.hstack((torch.tensor(np.array(data_train.loc[:,['logT','logS','logS*logT']])), D_train)) #torch.tensor(np.array(data_train.loc[:,['logF']]))
    X2_test = torch.hstack((torch.tensor(np.array(data_test.loc[:,['logT','logS','logS*logT']])), D_test)) #torch.tensor(np.array(data_train.loc[:,['logF']]))

    ###
    thetas = torch.nn.Parameter(torch.normal(0, .5, size=(D_train.shape[1],d), dtype=torch.float64, device=device))
    beta = torch.nn.Parameter(torch.normal(0, .5, size=(X_train.shape[1]+d,Y_train.shape[1]), dtype=torch.float64, device=device))
    beta_baseline = torch.nn.Parameter(torch.normal(0, .5, size=(X_train.shape[1],Y_train.shape[1]), dtype=torch.float64, device=device))
    beta_baseline2 = torch.nn.Parameter(torch.normal(0, .5, size=(X2_train.shape[1],Y_train.shape[1]), dtype=torch.float64, device=device))
    
    ###
    optimizer = torch.optim.LBFGS([beta,thetas], lr=.1, line_search_fn='strong_wolfe')
    losses = []
    for epoch in range(epochs):
        def closure():
            optimizer.zero_grad()
            loss = (((X_train@beta[d:] + (D_train@thetas)@beta[:d])-Y_train)**2).mean()
            loss.backward()
            return loss
        loss = optimizer.step(closure)
        abs_loss = (((X_train@beta[d:] + (D_train@thetas)@beta[:d])-Y_train).abs()).mean()
        losses.append(abs_loss.item())
    
        if epoch>=1:
            if losses[-2]-losses[-1]<=tol:
                break

    ###
    optimizer = torch.optim.LBFGS([beta_baseline], lr=.1, line_search_fn='strong_wolfe')
    losses = []
    for epoch in range(epochs):
        def closure():
            optimizer.zero_grad()
            loss = (((X_train@beta_baseline)-Y_train)**2).mean()
            loss.backward()
            return loss
        loss = optimizer.step(closure)
        abs_loss = (((X_train@beta_baseline)-Y_train).abs()).mean()
        losses.append(abs_loss.item())
    
        if epoch>=1:
            if losses[-2]-losses[-1]<=tol:
                break

    ###
    optimizer = torch.optim.LBFGS([beta_baseline2], lr=.1, line_search_fn='strong_wolfe')
    losses = []
    for epoch in range(epochs):
        def closure():
            optimizer.zero_grad()
            loss = (((X2_train@beta_baseline2)-Y_train)**2).mean()
            loss.backward()
            return loss
        loss = optimizer.step(closure)
        abs_loss = (((X2_train@beta_baseline2)-Y_train).abs()).mean()
        losses.append(abs_loss.item())
    
        if epoch>=1:
            if losses[-2]-losses[-1]<=tol:
                break

    preds.append(sigmoid(((X_test@beta[d:] + (D_test@thetas)@beta[:d]))).detach().numpy())
    preds_baseline.append(sigmoid(((X_test@beta_baseline))).detach().numpy())
    preds_baseline2.append(sigmoid(((X2_test@beta_baseline2))).detach().numpy())

    ###
    cols = [[0],[1],[2],[3],[4],list(range(Y_train.shape[1]))[5:]]
    pred_rf = np.zeros((Y_test.shape[0],Y_train.shape[1]))
    for col in cols:
        regr = RidgeCV(alphas=np.linspace(1e-5,10,20)) #RandomForestRegressor(n_estimators=1000, max_features='sqrt', random_state=0)
        regr.fit(Y_train[:,:4], Y_train[:,col].squeeze()) #np.delete(sigmoid(Y_train), col, axis=1), 
        pred_rf[:,col] = sigmoid(regr.predict(logit(preds_baseline2[-1])[:,:4]).reshape(-1,len(col)), use_torch=False) #np.delete(logit(preds_baseline2[-1]), col, axis=1)
        #pred_rf[:,[i for i in range(Y_train.shape[1]) if i not in col]] = preds_baseline2[-1][:,[i for i in range(Y_train.shape[1]) if i not in col]]
    preds_rf.append(pred_rf)
        
    ys.append(sigmoid(Y_test[:,:]).numpy())

  0%|          | 0/22 [00:00<?, ?it/s]

In [13]:
np.mean(np.abs(np.vstack(preds)[:,:5]-np.vstack(ys)[:,:5]), axis=0)

array([0.03484117, 0.04405401, 0.02929322, 0.04246133, 0.05998127])

In [14]:
np.mean(np.abs(np.vstack(preds_baseline)[:,:5]-np.vstack(ys)[:,:5]), axis=0)

array([0.05159197, 0.06620617, 0.04097824, 0.03716612, 0.0777711 ])

In [15]:
np.mean(np.abs(np.vstack(preds_baseline2)[:,:5]-np.vstack(ys)[:,:5]), axis=0)

array([0.02932462, 0.046043  , 0.02988084, 0.04813632, 0.06037273])

In [16]:
np.mean(np.abs(np.vstack(preds_rf)[:,:5]-np.vstack(ys)[:,:5]), axis=0)

array([0.02936353, 0.04601182, 0.02983732, 0.04809264, 0.09816683])

In [17]:
np.median(np.abs(np.vstack(preds)[:,:5]-np.vstack(ys)[:,:5]), axis=0)

array([0.02349686, 0.029052  , 0.02275559, 0.03488756, 0.01179707])

In [18]:
np.median(np.abs(np.vstack(preds_baseline)[:,:5]-np.vstack(ys)[:,:5]), axis=0)

array([0.03058035, 0.04576952, 0.02934638, 0.02100441, 0.02115813])

In [19]:
np.median(np.abs(np.vstack(preds_baseline2)[:,:5]-np.vstack(ys)[:,:5]), axis=0)

array([0.01784383, 0.03382038, 0.02087774, 0.04381062, 0.01115699])

In [20]:
np.median(np.abs(np.vstack(preds_rf)[:,:5]-np.vstack(ys)[:,:5]), axis=0)

array([0.01797291, 0.03410773, 0.02103038, 0.04392663, 0.02196258])

In [21]:
np.mean(np.abs(np.vstack(preds)[:,5:].mean(1)-np.vstack(ys)[:,5:].mean(1)))

0.05534097995038255

In [22]:
np.mean(np.abs(np.vstack(preds_baseline)[:,5:].mean(1)-np.vstack(ys)[:,5:].mean(1)))

0.07484974350315568

In [23]:
np.mean(np.abs(np.vstack(preds_baseline2)[:,5:].mean(1)-np.vstack(ys)[:,5:].mean(1)))

0.05556360186100937

In [24]:
np.mean(np.abs(np.vstack(preds_rf)[:,5:].mean(1)-np.vstack(ys)[:,5:].mean(1)), axis=0)

0.07491827144189878

In [21]:
np.median(np.abs(np.vstack(preds)[:,5:].mean(1)-np.vstack(ys)[:,5:].mean(1)))

0.04061560915989798

In [22]:
np.median(np.abs(np.vstack(preds_baseline)[:,5:].mean(1)-np.vstack(ys)[:,5:].mean(1)))

0.07036375818993645

In [23]:
np.median(np.abs(np.vstack(preds_baseline2)[:,5:].mean(1)-np.vstack(ys)[:,5:].mean(1)))

0.04036635411883588