In [1]:
import pandas as pd
from selection import ClassificationCostComputer, ClassificationQualityComputer, CascadeRouter, Router, ConstantStrategy, HyperoptStrategy, BaselineCascader, RepetitiveConstantStrategy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from tqdm import tqdm
from transformers import AutoTokenizer
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
from loguru import logger
import sys
import os
from collections import Counter

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

# set logger to only show info messages
logger.remove()
logger.add(sys.stdout, level="INFO")


  from .autonotebook import tqdm as notebook_tqdm


1

In [47]:
dataset = 'mmlu_arc_mixeval'
data_folder = '../data/classification'

train_model_answers = pd.read_json(os.path.join(data_folder, dataset, 'train', 'model_answers.json'))
train_costs = pd.read_json(os.path.join(data_folder, dataset, 'train', 'costs.json'))
train_qualities = pd.read_json(os.path.join(data_folder, dataset, 'train', 'qualities.json'))
train_queries = list(pd.read_json(os.path.join(data_folder, dataset, 'train', 'queries.json'))[0])
test_model_answers = pd.read_json(os.path.join(data_folder, dataset, 'test', 'model_answers.json'))
test_costs = pd.read_json(os.path.join(data_folder, dataset, 'test', 'costs.json'))
test_qualities = pd.read_json(os.path.join(data_folder, dataset, 'test', 'qualities.json'))
test_queries = list(pd.read_json(os.path.join(data_folder, dataset, 'test', 'queries.json'))[0])

In [48]:
def convert_to_numpy(model_answers, costs, qualities, models):
    model_answers = model_answers[models].values
    for i in range(len(model_answers)):
        for j in range(len(model_answers[i])):
            model_answers[i, j] = np.array(model_answers[i, j])
    costs = costs[models].values
    qualities = qualities[models].values
    return model_answers, costs, qualities

In [49]:
test_qualities_averaged = test_qualities.mean(axis=0)
test_costs_averaged = test_costs.mean(axis=0)

In [50]:
train_qualities_averaged = train_qualities.mean(axis=0)
train_costs_averaged = train_costs.mean(axis=0)

In [51]:
test_qualities_averaged, test_costs_averaged

(meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo      0.599673
 meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo     0.726307
 meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo    0.786492
 google/gemma-2-9b-it                             0.665577
 google/gemma-2-27b-it                            0.692266
 google/gemma-2b-it                               0.257625
 mistralai/Mistral-7B-Instruct-v0.3               0.549564
 mistralai/Mixtral-8x22B-Instruct-v0.1            0.698529
 mistralai/Mixtral-8x7B-Instruct-v0.1             0.629085
 dtype: float64,
 meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo      0.000048
 meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo     0.000236
 meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo    0.001342
 google/gemma-2-9b-it                             0.000082
 google/gemma-2-27b-it                            0.000220
 google/gemma-2b-it                               0.000027
 mistralai/Mistral-7B-Instruct-v0.3               0.000060
 mistralai/Mixtral-8x22B-Instruct-v0.1 

In [52]:
train_qualities_averaged, train_costs_averaged

(meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo      0.610942
 meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo     0.729760
 meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo    0.810998
 google/gemma-2-9b-it                             0.666759
 google/gemma-2-27b-it                            0.700746
 google/gemma-2b-it                               0.249240
 mistralai/Mistral-7B-Instruct-v0.3               0.555678
 mistralai/Mixtral-8x22B-Instruct-v0.1            0.701299
 mistralai/Mixtral-8x7B-Instruct-v0.1             0.626416
 dtype: float64,
 meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo      0.000048
 meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo     0.000234
 meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo    0.001328
 google/gemma-2-9b-it                             0.000081
 google/gemma-2-27b-it                            0.000217
 google/gemma-2b-it                               0.000027
 mistralai/Mistral-7B-Instruct-v0.3               0.000059
 mistralai/Mixtral-8x22B-Instruct-v0.1 

In [53]:
def prediction(cascader, questions, qualities, costs, actual_answers, models, is_router=False):
    qualities_output = []
    costs_output = []
    models_run = []
    selected_models = []
    for i, question in enumerate(questions):
        model_answers = [[None for _ in range(len(qualities[i]))]]
        cost = 0
        models_run_question = []
        for step in range(len(model_answers[0])):
            model = cascader.predict([question], model_answers)
            if model[0] is None:
                break
            else:
                model_index = models.index(model[0])
                models_run_question.append(model_index)
                model_answers[0][model_index] = actual_answers[i][model_index]
                cost += costs[i][model_index]
            if is_router:
                break
        selected_answer = cascader.select_answer([question], model_answers)
        selected_model = models.index(selected_answer[0])
        quality = qualities[i][models.index(selected_answer[0])]
        qualities_output.append(quality)
        costs_output.append(cost)
        models_run.append(','.join([str(model) for model in models_run_question]))
        selected_models.append(selected_model)
    counter = Counter(models_run)
    counter_selected = Counter(selected_models)
    return {
        'quality': np.mean(qualities_output),
        'cost': np.mean(costs_output),
        'models_run': counter,
        'selected_models': counter_selected,
        'lambdas': list(cascader.get_lambdas())
    }

In [54]:
all_models = [
        {
            'name': 'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
            'huggingface_name': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
            'read_cost': 0.18 * 10 ** -6,
            'write_cost': 0.18 * 10 ** -6
        },
        {
            'name': 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
            'huggingface_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',
            'read_cost': 0.88 * 10 ** -6,
            'write_cost': 0.88 * 10 ** -6
        },
        {
            'name': 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
            'huggingface_name': 'meta-llama/Meta-Llama-3.1-405B-Instruct',
            'read_cost': 0.88 * 10 ** -6,
            'write_cost': 0.88 * 10 ** -6
        },
        {
            'name': 'google/gemma-2-9b-it',
            'read_cost': 0.3 * 10 ** -6,
            'write_cost': 0.3 * 10 ** -6
        },
        {
            'name': 'google/gemma-2-27b-it',
            'read_cost': 0.8 * 10 ** -6,
            'write_cost': 0.8 * 10 ** -6
        },
        {
            'name': 'google/gemma-2b-it',
            'read_cost': 0.1 * 10 ** -6,
            'write_cost': 0.1 * 10 ** -6
        },
        {
            'name': 'mistralai/Mistral-7B-Instruct-v0.3',
            'read_cost': 0.2 * 10 ** -6,
            'write_cost': 0.2 * 10 ** -6
        },
        {
            'name': 'mistralai/Mixtral-8x22B-Instruct-v0.1',
            'read_cost': 1.2 * 10 ** -6,
            'write_cost': 1.2 * 10 ** -6
        },
        {
            'name': 'mistralai/Mixtral-8x7B-Instruct-v0.1',
            'read_cost': 0.6 * 10 ** -6,
            'write_cost': 0.6 * 10 ** -6
        },
    ]

In [55]:

class MinMaxLinearRegression(LinearRegression):
    def predict(self, X):
        return np.clip(super().predict(X), 0, 1)

def log_reg():
    return MinMaxLinearRegression()

In [28]:
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

quality_computer_baseline = ClassificationQualityComputer(
        model_class=lambda: LinearRegression(),
        n_highest_include=1,
        require_constant_not_run=False,
        baseline=True,
        include_question_embedding=False,
        do_cosine_similarity=False,
        is_regression=True,
        all_model_combinations=False,
        add_entropy=False,
        add_equal_argmax=False,
        add_js_divergence=False,
        include_question_length=True
    )
quality_computer = ClassificationQualityComputer(
        model_class=lambda: LogisticRegression(penalty=None),
        n_highest_include=2,
        require_constant_not_run=False,
        baseline=False,
        include_question_embedding=False,
        do_cosine_similarity=False,
        is_regression=False,
        all_model_combinations=True,
        add_entropy=False,
        add_equal_argmax=True,
        add_js_divergence=False,
        include_question_length=False,
        store_all=True,
        include_all_models=False,
        lookup_file_name=os.path.join(data_folder, dataset, 'embeddings', 'queries.json'),
    )

In [29]:
models = [all_models[5], all_models[0], all_models[1], all_models[2]]

model_names = [model['name'] for model in models]
train_model_answers_here, train_costs_here, train_qualities_here = convert_to_numpy(train_model_answers, train_costs, train_qualities, model_names)

In [30]:
test_model_answers_here, test_costs_here, test_qualities_here = convert_to_numpy(test_model_answers, test_costs, test_qualities, model_names)

In [31]:
quality_computer.fit(train_queries[:1800], train_model_answers_here[:1800], train_qualities_here[:1800])
quality_computer_baseline.fit(train_queries[:1800], train_model_answers_here[:1800], train_qualities_here[:1800])

Model 0: 100%|██████████| 16/16 [00:01<00:00, 14.23it/s]
Model 1: 100%|██████████| 16/16 [00:01<00:00, 13.26it/s]
Model 2: 100%|██████████| 16/16 [00:01<00:00, 14.17it/s]
Model 3: 100%|██████████| 16/16 [00:01<00:00, 14.52it/s]


In [56]:
import time

def whiten_samples(samples, covariances):
    whitened_samples = []
    for x_i, sigma_i in zip(samples, covariances):
        # Eigenvalue decomposition
        eigvals, eigvecs = np.linalg.eigh(sigma_i)
        
        # Construct the whitening matrix
        W_i = eigvecs @ np.diag(1.0 / np.sqrt(eigvals)) @ eigvecs.T
        
        # Apply the whitening matrix
        y_i = W_i @ x_i
        whitened_samples.append(y_i)
        
    return np.array(whitened_samples)


def compute_accuracy_per_model(real, predictions, var_predictions, predictions_all=None):
    model_accuracies = []
    model_losses = []
    normal_losses = []
    for model in range(real.shape[1]):
        predictions_acc = (predictions[:, model] > 0.5).astype(int)
        real_acc = real[:, model]
        model_accuracies.append(np.mean(predictions_acc == real_acc))
        # model_losses.append(np.mean(np.log(predictions[:, model]) * real[:, model] + np.log(1 - predictions[:, model]) * (1 - real[:, model])))
        # absolute error
        model_losses.append(np.mean(np.square(predictions[:, model] - real[:, model])))
    
    if predictions_all is None:
        normal_losses = 0
    else:
        normal_unit_converted = whiten_samples(predictions - predictions_all, var_predictions)
        cov = np.cov(normal_unit_converted.T)
        # compute distance between cov and identity matrix
        normal_losses = np.linalg.norm(cov - np.eye(cov.shape[0]))
    return model_accuracies, model_losses, normal_losses

for n_answers in range(5):
    print(n_answers)
    train_here = test_model_answers_here.copy()

    t = time.time()
    predictions_all, _ = quality_computer.predict(
        test_queries, test_model_answers_here
    )
    print('Time:', time.time() - t)
    predictions_all_baseline, _ = quality_computer_baseline.predict(
        test_queries, test_model_answers_here
    )

    for index in range(len(train_here)):
        for j in range(len(train_here[index]) - 1, n_answers - 1, -1):
            train_here[index][j] = None

    predictions, var_predictions = quality_computer.predict(
        test_queries, train_here
    )

    predictions_baseline, var_predictions_baseline = quality_computer_baseline.predict(
        test_queries, train_here
    )

    model_accuracies, model_losses, normal_losses = compute_accuracy_per_model(test_qualities_here, predictions, var_predictions, predictions_all)
    print('Model accuracies:', model_accuracies)
    print('Model losses:', model_losses)
    print('Normal losses:', normal_losses)
    model_accuracies_baseline, model_losses_baseline, normal_losses_baseline = compute_accuracy_per_model(test_qualities_here, predictions_baseline, var_predictions_baseline)
    print('Model accuracies baseline:', model_accuracies_baseline)
    print('Model losses baseline:', model_losses_baseline)
    print('Normal losses baseline:', normal_losses_baseline)

0
Time: 0.10339617729187012
Model accuracies: [0.7423747276688453, 0.7156862745098039, 0.7263071895424836, 0.7864923747276689]
Model losses: [0.18339623804690564, 0.1986865860528536, 0.16339347357586112, 0.14507886760312547]
Normal losses: 0.07116637168037891
Model accuracies baseline: [0.7423747276688453, 0.5996732026143791, 0.7263071895424836, 0.7864923747276689]
Model losses baseline: [0.19127291212781403, 0.24008302469135806, 0.198832153231663, 0.1681873335754055]
Normal losses baseline: 0
1
Time: 0.06497716903686523
Model accuracies: [0.7423747276688453, 0.7156862745098039, 0.7263071895424836, 0.7864923747276689]
Model losses: [0.1837849222951397, 0.1986865860528536, 0.16339347357586112, 0.14507886760312547]
Normal losses: 0.07814294470890375
Model accuracies baseline: [0.7423747276688453, 0.5996732026143791, 0.7263071895424836, 0.7864923747276689]
Model losses baseline: [0.2576252723311547, 0.24008302469135806, 0.198832153231663, 0.1681873335754055]
Normal losses baseline: 0
2
Ti

  W_i = eigvecs @ np.diag(1.0 / np.sqrt(eigvals)) @ eigvecs.T
  W_i = eigvecs @ np.diag(1.0 / np.sqrt(eigvals)) @ eigvecs.T


In [37]:
index = 0
print(quality_computer.generate_sample_input_output(
    test_queries[index], 0, 4, [None, None, None, None]
))

([0.0, 1.0, 0.0, 0.25], None)


In [38]:
quality_computer.models[0][''].coef_, quality_computer.models[0][''].intercept_

(array([[-1.06428765,  0.24187216,  0.02706587, -0.03862987]]),
 array([-0.79534962]))

In [21]:
quality_computer.base_features(
    test_queries[index], None
)

['Question: The cell structure that makes a plant cell more rigid than an animal cell is the\nA: cell membrane.\nB: cytoplasm.\nC: cell wall.\nD: ribosome.\n\nAnswer: C\n\nQuestion: Garden plants on Earth require four resources to stay alive: soil, air, water, and sunlight. How many of these resources are necessary for life to exist on the moon or another planet?\nA: 4\nB: 3\nC: 2\nD: 1\n\nAnswer:'] [array([0.29303635, 0.69750749, 0.79207283, 0.85444476])]


[0.0,
 1.0,
 0.0,
 0.25,
 0.29303635202569184,
 0.69750748644879,
 0.7920728253923097,
 0.8544447605214256]

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
new_questions = [test_queries[index][0]]
question_embedding = quality_computer.compute_sentence_embeddings(new_questions)
estimated_qualities = []
for question, embedding in zip(new_questions, [question_embedding]):
    if not isinstance(question, str):
        question = question[0]
    cosine_similarity_q = cosine_similarity(embedding.reshape(1, -1), quality_computer.question_embeddings)
    estimated_qualities.append(
        np.sum(cosine_similarity_q.reshape(-1, 1) * quality_computer.qualities_embeddings, axis=0) / np.sum(cosine_similarity_q)
    )
    print(quality_computer.qualities_embeddings)
print(new_questions)
print(estimated_qualities)


[[0 1 1 1]
 [0 1 1 1]
 [0 1 1 1]
 ...
 [0 1 1 1]
 [0 1 1 1]
 [0 0 1 1]]
['Question: The cell structure that makes a plant cell more rigid than an animal cell is the\nA: cell membrane.\nB: cytoplasm.\nC: cell wall.\nD: ribosome.\n\nAnswer: C\n\nQuestion: Garden plants on Earth require four resources to stay alive: soil, air, water, and sunlight. How many of these resources are necessary for life to exist on the moon or another planet?\nA: 4\nB: 3\nC: 2\nD: 1\n\nAnswer:']
[array([0.29303635, 0.69750749, 0.79207283, 0.85444476])]
