In [14]:
import json
import pickle

import numpy as np
import openai
import torch
from sentence_transformers import SentenceTransformer
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

from open_ai_key import API_KEY

# Configure the openai module with the key imported from the local open_ai_key module.
openai.api_key = API_KEY

  from tqdm.autonotebook import tqdm, trange


In [15]:
# Prefer the GPU when torch reports one, otherwise fall back to the CPU.
# NOTE(review): `device` is not passed to SentenceTransformer below.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Folder handed to SentenceTransformer as its cache location.
cache_dir = "C:\\LLMs"

# Embedding model used to encode incoming questions.
sentence_model = SentenceTransformer(
    "all-mpnet-base-v2",
    cache_folder=cache_dir,
)
# Alternative embedding model, currently disabled:
# sentence_model = SentenceTransformer('nvidia/NV-Embed-v2', trust_remote_code=True, cache_folder = cache_dir)



In [9]:
@retry(
    wait=wait_random_exponential(min=1, max=60),
    # Without a stop condition a permanent failure (bad API key, unknown
    # model, malformed request) would be retried forever and never surface.
    stop=stop_after_attempt(6),
    # Re-raise the underlying exception instead of tenacity's RetryError so
    # callers see the real cause.
    reraise=True,
)
def query_api_any_message(message, engine, **kwargs):
    """Send a single prompt to an OpenAI model and return the raw API response.

    Chat models are called through ``openai.ChatCompletion`` with the prompt
    wrapped as one user message; every other engine goes through
    ``openai.Completion``.

    Args:
        message: Prompt text to send.
        engine: Model name, e.g. ``"gpt-4"`` or ``"gpt-3.5-turbo"``.
        **kwargs: Extra arguments forwarded unchanged to the ``create`` call.
            ``temperature`` defaults to ``0.0`` when not supplied.

    Returns:
        The response object returned by the openai ``create`` call.

    Raises:
        Exception: the last error raised by the API call once six attempts
            (with randomized exponential backoff capped at 60 seconds) have
            all failed.
    """
    kwargs.setdefault("temperature", 0.0)

    # Versioned/extended chat models ("gpt-4-0613", "gpt-3.5-turbo-16k", ...)
    # must also use the chat endpoint; "gpt-3.5-turbo-instruct" is a plain
    # completion model and is therefore excluded.
    is_chat_model = (
        engine.startswith(("gpt-4", "gpt-3.5-turbo"))
        and "instruct" not in engine
    )

    if is_chat_model:
        message_dict = [{"role": "user", "content": message}]
        response = openai.ChatCompletion.create(
            model=engine,
            messages=message_dict,
            **kwargs
        )
    else:
        response = openai.Completion.create(
            engine=engine,
            prompt=message,
            **kwargs
        )
    return response

In [18]:
# Load the entries that query_api ranks against an incoming question; each
# entry is read there via its 'question', 'response_time' and 'embedding' keys.
# NOTE(review): unpickling can execute arbitrary code — only load a file you trust.
with open('query_to_time_embedding_dataset.pkl', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

In [None]:
def _parse_seconds(content):
    """Return the value stored under 'seconds' in the model's JSON reply.

    The reply is first parsed as JSON as-is. If that fails, the text between
    the first '{' and the last '}' is parsed instead, so a reply wrapped in a
    Markdown code fence or surrounded by prose is still accepted.

    Raises:
        json.JSONDecodeError: if no JSON object can be parsed from the reply.
        KeyError: if the parsed object has no 'seconds' key.
    """
    try:
        payload = json.loads(content)
    except json.JSONDecodeError:
        start, end = content.find("{"), content.rfind("}")
        if start == -1 or end < start:
            # Nothing that looks like a JSON object: surface the original error.
            raise
        payload = json.loads(content[start:end + 1])
    return payload["seconds"]


def query_api(question):
    """Ask GPT-4 to estimate how long an average human needs to answer `question`.

    The question is embedded with `sentence_model`, the two `dataset` entries
    with the highest dot-product score against the unit-normalized embedding
    are used as worked examples in the prompt, and the model's answer is
    printed and returned.

    Args:
        question: The question text shown to the user.

    Returns:
        The value the model reported under the 'seconds' key.

    Raises:
        IndexError: if `dataset` holds fewer than two entries.
        json.JSONDecodeError: if the model's reply contains no parseable JSON.
        KeyError: if the reply has no 'seconds' key.
    """
    entries = list(dataset)
    if len(entries) < 2:
        raise IndexError(
            "query_api needs at least two dataset entries to use as examples"
        )

    embedding = sentence_model.encode(question)
    embedding = embedding / np.linalg.norm(embedding)

    # Score every entry, then rank indices instead of copying each dict.
    # sorted() is stable, so ties keep dataset order exactly as before.
    # Presumably the stored embeddings are unit-normalized too, making this
    # cosine similarity — TODO confirm against the dataset builder.
    scores = [embedding.T @ entry['embedding'] for entry in entries]
    ranked = sorted(range(len(entries)), key=scores.__getitem__, reverse=True)
    top_k = [entries[i] for i in ranked[:2]]

    # NOTE(review): the dataset may contain the same question more than once
    # (the recorded run shows both examples using one question with different
    # times); consider de-duplicating examples by question text.
    message = "I'm paying you $100,000 to do this task correctly. A human is given a question. Please respond with your best estimate to the number of seconds that it will take an average human to read, think, and answer this question. "
    message += f"For example, when given the question '{top_k[0]['question']}', a user takes on average {top_k[0]['response_time']} seconds to respond. "
    message += f"As another example, users are given the following question: '{top_k[1]['question']}'. "
    message += f"The average response time to this question is {top_k[1]['response_time']} seconds. "
    message += f"Now, a user is given the question: '{question}' "
    message += " What is your best estimate of the number of seconds that this will take? Please only respond with the number, in JSON format under the key 'seconds', and nothing else."

    print("LLM is given the following message:")
    print(message)

    response = query_api_any_message(message, "gpt-4", temperature=0.0)
    seconds = _parse_seconds(response["choices"][0]["message"]["content"])
    print("LLM Estimated time: ", seconds)
    # Return the estimate so callers can use it, not just read it from stdout.
    return seconds

In [30]:
# Example call: estimate the response time for a short yes/no question.
query_api("Do you like spaghetti?")

LLM is given the following message:
I'm paying you $100,000 to do this task correctly. A human is given a question. Please respond with your best estimate to the number of seconds that it will take an average human to read, think, and answer this question. For example, when given the question 'Do you enjoy reading articles about food and cooking?', a user takes 15.615 seconds to respond. As another example, users are given the following question: 'Do you enjoy reading articles about food and cooking?'. The average response time to this question is 4.001 seconds. Now, a user is given the question: Do you like spaghetti? What is your best estimate of the number of seconds that this will take? Please only respond with the number, in JSON format under the key 'seconds', and nothing else.
LLM Estimated time:  3.5
