In [1]:
import pandas as pd
from selection import OpenFormCostComputer, OpenFormQualityComputer, CascadeRouter, Router, ConstantStrategy, HyperoptStrategy, BaselineCascader, RepetitiveConstantStrategy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost.sklearn import XGBClassifier, XGBRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from tqdm import tqdm
from transformers import AutoTokenizer
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
from loguru import logger
import sys
import os
from collections import Counter

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

# set logger to only show info messages
logger.remove()
logger.add(sys.stdout, level="INFO")


  from .autonotebook import tqdm as notebook_tqdm


1

In [2]:
dataset = 'mmlu_gsm8k'
data_folder = '../data/free_form'

train_model_answers = pd.read_json(os.path.join(data_folder, dataset, 'train', 'model_answers.json'))
train_costs = pd.read_json(os.path.join(data_folder, dataset, 'train', 'costs.json'))
train_qualities = pd.read_json(os.path.join(data_folder, dataset, 'train', 'qualities.json'))
train_queries = pd.read_json(os.path.join(data_folder, dataset, 'train', 'queries.json'))
test_model_answers = pd.read_json(os.path.join(data_folder, dataset, 'test', 'model_answers.json'))
test_costs = pd.read_json(os.path.join(data_folder, dataset, 'test', 'costs.json'))
test_qualities = pd.read_json(os.path.join(data_folder, dataset, 'test', 'qualities.json'))
test_queries = pd.read_json(os.path.join(data_folder, dataset, 'test', 'queries.json'))

train_queries = list(train_queries.apply(lambda x: [x[0][0][1] + '\nQuestion: ' + x[1][1] + '\nAnswer:' + x[2][1] + '\nQuestion:' + x[3][1]] + x[0][1:], axis=1))
test_queries = list(test_queries.apply(lambda x: [x[0][0][1] + '\nQuestion: ' + x[1][1] + '\nAnswer:' + x[2][1] + '\nQuestion:' + x[3][1]] + x[0][1:], axis=1))

In [3]:
def convert_to_numpy(model_answers, costs, qualities, models):
    model_answers = model_answers[models].values
    for i in range(len(model_answers)):
        for j in range(len(model_answers[i])):
            model_answers[i, j][1] = np.array(model_answers[i, j][1])
    costs = costs[models].values
    qualities = qualities[models].values
    return model_answers, costs, qualities

In [4]:
test_qualities_averaged = test_qualities.mean(axis=0)
test_costs_averaged = test_costs.mean(axis=0)

In [5]:
train_qualities_averaged = train_qualities.mean(axis=0)
train_costs_averaged = train_costs.mean(axis=0)

In [6]:
test_qualities_averaged, test_costs_averaged

(meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo      0.613338
 meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo     0.787159
 meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo    0.825470
 google/gemma-2-9b-it                             0.595956
 google/gemma-2-27b-it                            0.654487
 google/gemma-2b-it                               0.105002
 mistralai/Mistral-7B-Instruct-v0.3               0.395530
 mistralai/Mixtral-8x22B-Instruct-v0.1            0.668677
 mistralai/Mixtral-8x7B-Instruct-v0.1             0.521107
 dtype: float64,
 meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo      0.000117
 meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo     0.000549
 meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo    0.003132
 google/gemma-2-9b-it                             0.000187
 google/gemma-2-27b-it                            0.000503
 google/gemma-2b-it                               0.000047
 mistralai/Mistral-7B-Instruct-v0.3               0.000138
 mistralai/Mixtral-8x22B-Instruct-v0.1 

In [7]:
train_qualities_averaged, train_costs_averaged

(meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo      0.682667
 meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo     0.810333
 meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo    0.859667
 google/gemma-2-9b-it                             0.636667
 google/gemma-2-27b-it                            0.676333
 google/gemma-2b-it                               0.093667
 mistralai/Mistral-7B-Instruct-v0.3               0.415667
 mistralai/Mixtral-8x22B-Instruct-v0.1            0.685000
 mistralai/Mixtral-8x7B-Instruct-v0.1             0.544000
 dtype: float64,
 meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo      0.000113
 meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo     0.000533
 meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo    0.003012
 google/gemma-2-9b-it                             0.000182
 google/gemma-2-27b-it                            0.000489
 google/gemma-2b-it                               0.000046
 mistralai/Mistral-7B-Instruct-v0.3               0.000136
 mistralai/Mixtral-8x22B-Instruct-v0.1 

In [8]:
def prediction(cascader, questions, qualities, costs, actual_answers, models, is_router=False):
    qualities_output = []
    costs_output = []
    models_run = []
    selected_models = []
    for i, question in enumerate(questions):
        model_answers = [[None for _ in range(len(qualities[i]))]]
        cost = 0
        models_run_question = []
        for step in range(len(model_answers[0])):
            model = cascader.predict([question], model_answers)
            if model[0] is None:
                break
            else:
                model_index = models.index(model[0])
                models_run_question.append(model_index)
                model_answers[0][model_index] = actual_answers[i][model_index]
                cost += costs[i][model_index]
            if is_router:
                break
        selected_answer = cascader.select_answer([question], model_answers)
        selected_model = models.index(selected_answer[0])
        quality = qualities[i][models.index(selected_answer[0])]
        qualities_output.append(quality)
        costs_output.append(cost)
        models_run.append(','.join([str(model) for model in models_run_question]))
        selected_models.append(selected_model)
    counter = Counter(models_run)
    counter_selected = Counter(selected_models)
    return {
        'quality': np.mean(qualities_output),
        'cost': np.mean(costs_output),
        'models_run': counter,
        'selected_models': counter_selected,
        'lambdas': list(cascader.get_lambdas())
    }

In [9]:
all_models = [
        {
            'name': 'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
            'huggingface_name': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
            'read_cost': 0.18 * 10 ** -6,
            'write_cost': 0.18 * 10 ** -6
        },
        {
            'name': 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
            'huggingface_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',
            'read_cost': 0.88 * 10 ** -6,
            'write_cost': 0.88 * 10 ** -6
        },
        {
            'name': 'google/gemma-2-9b-it',
            'read_cost': 0.3 * 10 ** -6,
            'write_cost': 0.3 * 10 ** -6
        },
        {
            'name': 'google/gemma-2-27b-it',
            'read_cost': 0.8 * 10 ** -6,
            'write_cost': 0.8 * 10 ** -6
        },
        {
            'name': 'google/gemma-2b-it',
            'read_cost': 0.1 * 10 ** -6,
            'write_cost': 0.1 * 10 ** -6
        },
        {
            'name': 'mistralai/Mistral-7B-Instruct-v0.3',
            'read_cost': 0.2 * 10 ** -6,
            'write_cost': 0.2 * 10 ** -6
        },
        {
            'name': 'mistralai/Mixtral-8x22B-Instruct-v0.1',
            'read_cost': 1.2 * 10 ** -6,
            'write_cost': 1.2 * 10 ** -6
        },
        {
            'name': 'mistralai/Mixtral-8x7B-Instruct-v0.1',
            'read_cost': 0.6 * 10 ** -6,
            'write_cost': 0.6 * 10 ** -6
        },
    ]

In [10]:

class MinMaxLinearRegression(LinearRegression):
    def predict(self, X):
        return np.clip(super().predict(X), 0, 1)

def log_reg():
    return MinMaxLinearRegression()

In [11]:
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

quality_computer_baseline = OpenFormQualityComputer(
        model_class=lambda: LogisticRegression(max_iter=1000),
        require_constant_not_run=False,
        baseline=True,
        include_question_embedding=False,
        do_cosine_similarity=False,
        n_training_samples=128,
        is_regression=False,
        all_model_combinations=False,
        baseline_feature='mean'
    )
quality_computer = OpenFormQualityComputer(
        model_class=lambda: RandomForestClassifier(max_depth=1),
        require_constant_not_run=False,
        baseline=False,
        include_question_embedding=False,
        do_cosine_similarity=False,
        n_training_samples=128,
        is_regression=False,
        all_model_combinations=True,
        store_all=True,
        include_all_models=False
    )

In [12]:
models = [all_models[0], all_models[1], all_models[2]]

model_names = [model['name'] for model in models]
train_model_answers_here, train_costs_here, train_qualities_here = convert_to_numpy(train_model_answers, train_costs, train_qualities, model_names)

In [13]:
quality_computer.fit(train_queries[:1500], train_model_answers_here[:1500], train_qualities_here[:1500])

Model 0: 100%|██████████| 8/8 [00:01<00:00,  5.23it/s]
Model 1: 100%|██████████| 8/8 [00:01<00:00,  6.16it/s]
Model 2: 100%|██████████| 8/8 [00:01<00:00,  5.83it/s]


In [14]:
quality_computer_baseline.fit(train_queries[:1500], train_model_answers_here[:1500], train_qualities_here[:1500])

In [15]:
import time

def whiten_samples(samples, covariances):
    whitened_samples = []
    for x_i, sigma_i in zip(samples, covariances):
        # Eigenvalue decomposition
        eigvals, eigvecs = np.linalg.eigh(sigma_i)
        
        # Construct the whitening matrix
        W_i = eigvecs @ np.diag(1.0 / np.sqrt(eigvals)) @ eigvecs.T
        
        # Apply the whitening matrix
        y_i = W_i @ x_i
        whitened_samples.append(y_i)
        
    return np.array(whitened_samples)


def compute_accuracy_per_model(real, predictions, var_predictions, predictions_all=None):
    model_accuracies = []
    model_losses = []
    normal_losses = []
    for model in range(real.shape[1]):
        predictions_acc = (predictions[:, model] > 0.5).astype(int)
        real_acc = real[:, model]
        model_accuracies.append(np.mean(predictions_acc == real_acc))
        # model_losses.append(np.mean(np.log(predictions[:, model]) * real[:, model] + np.log(1 - predictions[:, model]) * (1 - real[:, model])))
        # absolute error
        model_losses.append(np.mean(np.abs(predictions[:, model] - real[:, model])))
    
    if predictions_all is None:
        normal_losses = 0
    else:
        normal_unit_converted = whiten_samples(predictions - predictions_all, var_predictions)
        cov = np.cov(normal_unit_converted.T)
        # compute distance between cov and identity matrix
        normal_losses = np.linalg.norm(cov - np.eye(cov.shape[0]))
    return model_accuracies, model_losses, normal_losses

for n_answers in range(4):
    print(n_answers)
    train_here = train_model_answers_here[1500:].copy()

    t = time.time()
    predictions_all, _ = quality_computer.predict(
        train_queries[1500:], train_model_answers_here[1500:]
    )
    print('Time:', time.time() - t)
    predictions_all_baseline, _ = quality_computer_baseline.predict(
        train_queries[1500:], train_model_answers_here[1500:]
    )

    for index in range(len(train_here)):
        for j in range(len(train_here[index]) - 1, n_answers - 1, -1):
            train_here[index][j] = None

    predictions, var_predictions = quality_computer.predict(
        train_queries[1500:], train_here
    )

    predictions_baseline, var_predictions_baseline = quality_computer_baseline.predict(
        train_queries[1500:], train_here
    )

    model_accuracies, model_losses, normal_losses = compute_accuracy_per_model(train_qualities_here[1500:], predictions, var_predictions, predictions_all)
    print('Model accuracies:', model_accuracies)
    print('Model losses:', model_losses)
    print('Normal losses:', normal_losses)
    model_accuracies_baseline, model_losses_baseline, normal_losses_baseline = compute_accuracy_per_model(train_qualities_here[1500:], predictions_baseline, var_predictions_baseline)
    print('Model accuracies baseline:', model_accuracies_baseline)
    print('Model losses baseline:', model_losses_baseline)
    print('Normal losses baseline:', normal_losses_baseline)

0
Time: 0.3213496208190918
Model accuracies: [0.7246666666666667, 0.818, 0.7533333333333333]
Model losses: [0.32724806664044115, 0.2507200916365273, 0.3239515557311815]
Normal losses: 0.0677273139609472
Model accuracies baseline: [0.6846666666666666, 0.818, 0.6426666666666667]
Model losses baseline: [0.43327377777777776, 0.307504, 0.4627164444444445]
Normal losses baseline: 0
1
Time: 0.01784491539001465
Model accuracies: [0.7586666666666667, 0.818, 0.7533333333333333]
Model losses: [0.33320405911743284, 0.25095772090184426, 0.323610245070331]
Normal losses: 0.06502160545446456
Model accuracies baseline: [0.738, 0.818, 0.6426666666666667]
Model losses baseline: [0.35733354563549224, 0.307504, 0.4627164444444445]
Normal losses baseline: 0
2
Time: 0.017685890197753906
Model accuracies: [0.8613333333333333, 0.818, 0.7533333333333333]
Model losses: [0.2959633946988524, 0.2503908167425938, 0.3244285631275072]
Normal losses: 0.0998912015251974
Model accuracies baseline: [0.7026666666666667, 0

  W_i = eigvecs @ np.diag(1.0 / np.sqrt(eigvals)) @ eigvecs.T
  W_i = eigvecs @ np.diag(1.0 / np.sqrt(eigvals)) @ eigvecs.T


In [31]:
index = 660
quality_computer.generate_sample_input_output(train_queries[index], 0, 3, [train_model_answers_here[index][0], None, None], train_qualities_here[index])

([1.0,
  0.0,
  0.1,
  -0.9844693852359999,
  -0.4068055310892562,
  -8.375,
  -0.026550292500000003,
  5.488937726156687,
  1],
 1)

In [13]:
from transformers import AutoTokenizer

tokenizers = [
    AutoTokenizer.from_pretrained(model['name']) for model in models
]
input_costs = [
    model['read_cost'] for model in models
]

output_costs = [
    model['write_cost'] for model in models
]

cost_computer = OpenFormCostComputer(input_costs, output_costs, tokenizers, store_all=False)

In [14]:
cost_computer.fit(train_queries[:600], train_model_answers_here[:600], train_costs[:600])

In [15]:
prediction = cost_computer.predict(train_queries[600:], [[None, None, None] for index in range(len(train_queries[600:]))])

In [35]:
prediction[0]

array([[0.0001252, 0.0003996, 0.0007752],
       [0.0001694, 0.0005094, 0.0010176],
       [0.000343 , 0.0008676, 0.0018966],
       ...,
       [0.0001402, 0.0004776, 0.0008982],
       [0.0001246, 0.000387 , 0.0007608],
       [0.000153 , 0.0004692, 0.0009282]])

In [22]:
train_costs_here[600:]

array([[0.000128 , 0.000411 , 0.0007572],
       [0.0001722, 0.0005208, 0.001092 ],
       [0.0003458, 0.000879 , 0.0015336],
       ...,
       [0.000143 , 0.000489 , 0.000954 ],
       [0.0001274, 0.0003984, 0.0007884],
       [0.0001556, 0.00048  , 0.0010092]])

In [31]:
questions = train_queries[600:620]
model_answers = [[train_model_answers_here[index][0], train_model_answers_here[index][1], None] for index in range(len(train_queries[600:620]))]
length_models = len(model_answers[0])

all_costs = []
for i in range(len(questions)):
    models_run = ','.join([str(int(model_answers[i][model] is not None)) for model in range(length_models)])
    tokenized_question = questions[i]
    tokenized_model_answers = [answer[0] if answer is not None else None for answer in model_answers[i]]
    if cost_computer.tokenize:
        tokenized_question = [cost_computer.tokenizers[model]([questions[i]], padding=False)['input_ids'][0] for model in range(length_models)]
        tokenized_model_answers = [
            cost_computer.tokenizers[model]([answer[0]], padding=False)['input_ids'][0] if answer is not None else None
            for answer, model in zip(model_answers[i], range(length_models))
        ]
    costs = []
    for model in range(length_models):
        if cost_computer.constant_cost:
            cost = cost_computer.constant_costs[model]
            costs.append(cost)
            continue
        cost = cost_computer.input_costs[model] * len(tokenized_question[model])
        if model_answers[i][model] is None:
            cost += cost_computer.average_output_cost[model]
        else:
            cost += cost_computer.output_costs[model] * len(tokenized_model_answers[model])
        costs.append(cost)

    all_costs.append(costs)

[[1, 2559, 13649, 1040, 3466, 1070, 1032, 25179, 1163, 5822, 16555, 1070, 26226, 29473, 29508, 29502, 29493, 29473, 29508, 29518, 29493, 1072, 29473, 29508, 29550, 29493, 1246, 1309, 1706, 3352, 1034, 29510, 29481, 12071, 29493, 1458, 1117, 8426, 1507, 1040, 9248, 1228, 1227, 2846, 29491, 3352, 1034, 29510, 29481, 12071, 5373, 1137, 1040, 3466, 1093, 29509, 29499, 1070, 1032, 25179, 1163, 2849, 26226, 1032, 29493, 1055, 29493, 1072, 1045, 1117, 2846, 1254, 29515, 781, 781, 29509, 1095, 19096, 3344, 29560, 29481, 29500, 29481, 29501, 29476, 4615, 29481, 29501, 29494, 4615, 29481, 29501, 29485, 5521, 781, 781, 3724, 1036, 1117, 1040, 13342, 29501, 1255, 17427, 29493, 1458, 1117, 14275, 1158, 1040, 5919, 1070, 1040, 2849, 26226, 1093, 29481, 1095, 1093, 29476, 29574, 29494, 29574, 29485, 10978, 29518, 1377, 1328, 1224, 1990, 29493, 1093, 29476, 29514, 29508, 29502, 1325, 1093, 29494, 29514, 29508, 29518, 1325, 1072, 1093, 29485, 29514, 29508, 29550, 1377, 781, 781, 8257, 29493, 1246, 1467

In [30]:
all_costs

[[0.0001984, 0.0005147999999999999, 0.0006700828866666666],
 [0.0002006, 0.0005945999999999999, 0.0008800828866666665],
 [0.00024159999999999996, 0.0007812, 0.0013468828866666667],
 [0.00015419999999999998, 0.0005046, 0.0008536828866666666],
 [0.00020800000000000001, 0.0006516, 0.0010252828866666667],
 [0.00024419999999999997, 0.0006203999999999999, 0.0007240828866666665],
 [0.00011699999999999998, 0.00040019999999999997, 0.0005800828866666666],
 [0.0002014, 0.000531, 0.0009364828866666666],
 [0.000225, 0.0005184, 0.0006676828866666666],
 [0.00013059999999999998, 0.00048059999999999997, 0.0006268828866666666],
 [0.00013759999999999998, 0.00044579999999999994, 0.0007864828866666665],
 [0.00010599999999999999, 0.00032399999999999996, 0.0005968828866666666],
 [0.0001254, 0.00041999999999999996, 0.0006664828866666666],
 [0.0001258, 0.0003804, 0.0006220828866666666],
 [0.00011679999999999999, 0.00034439999999999997, 0.0006136828866666666],
 [0.000121, 0.000324, 0.0005620828866666666],
 [0.0

In [33]:
all_costs = []
for model in range(length_models):
    model_answers_this = [model_answers[i][model][0] if model_answers[i][model] is not None else '' for i in range(len(model_answers))]
    
    costs = []
    for i in range(len(questions)):
        tokenized_question = cost_computer.tokenizers[model]([questions[i]], padding=False)['input_ids'][0]
        tokenized_model_answer = cost_computer.tokenizers[model]([model_answers_this[i]], padding=False)['input_ids'][0]
        cost = cost_computer.input_costs[model] * len(tokenized_question)
        if model_answers[i][model] is None:
            cost += cost_computer.average_output_cost[model]
        else:
            cost += cost_computer.output_costs[model] * len(tokenized_model_answer)
        costs.append(cost)

    all_costs.append(costs)

np.array(all_costs).T

array([[0.0001984 , 0.0005148 , 0.00067008],
       [0.0002006 , 0.0005946 , 0.00088008],
       [0.0002416 , 0.0007812 , 0.00134688],
       [0.0001542 , 0.0005046 , 0.00085368],
       [0.000208  , 0.0006516 , 0.00102528],
       [0.0002442 , 0.0006204 , 0.00072408],
       [0.000117  , 0.0004002 , 0.00058008],
       [0.0002014 , 0.000531  , 0.00093648],
       [0.000225  , 0.0005184 , 0.00066768],
       [0.0001306 , 0.0004806 , 0.00062688],
       [0.0001376 , 0.0004458 , 0.00078648],
       [0.000106  , 0.000324  , 0.00059688],
       [0.0001254 , 0.00042   , 0.00066648],
       [0.0001258 , 0.0003804 , 0.00062208],
       [0.0001168 , 0.0003444 , 0.00061368],
       [0.000121  , 0.000324  , 0.00056208],
       [0.0001742 , 0.0005034 , 0.00081648],
       [0.0001802 , 0.0005028 , 0.00079968],
       [0.0002236 , 0.000639  , 0.00067608],
       [0.000199  , 0.0005664 , 0.00104928]])