In [1]:
import openai
from tenacity import (
    retry,
    wait_random_exponential,
)
import json
import pickle
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import random

from open_ai_key import API_KEY

openai.api_key = API_KEY

  from tqdm.autonotebook import tqdm, trange


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cache_dir = "C:\\LLMs"
sentence_model = SentenceTransformer('all-mpnet-base-v2', cache_folder = cache_dir)
# sentence_model = SentenceTransformer('nvidia/NV-Embed-v2', trust_remote_code=True, cache_folder = cache_dir)



In [3]:
@retry(wait=wait_random_exponential(min=1, max=60))
def query_api_any_message(message, engine, **kwargs):
    if "temperature" not in kwargs:
        kwargs["temperature"] = 0.0
    if engine == "gpt-4" or engine == "gpt-3.5-turbo":
        message_dict = [{"role": "user", "content": message}]
        response = openai.ChatCompletion.create(
            model=engine,
            messages=message_dict,
            **kwargs
        )
    else:
        response = openai.Completion.create(
            engine=engine,
            prompt=message,
            **kwargs
        )
    return response

In [4]:
with open('query_to_time_embedding_dataset.pkl', 'rb') as f:
    dataset = pickle.load(f)

random.shuffle(dataset)
l = len(dataset)
test_len = round(0.1*l)
test_dataset = dataset[-1*test_len:]

dataset = dataset[0:-1*test_len]

dataset_no_dupes = set([(data['question'], tuple(data['embedding']/np.linalg.norm(data['embedding']))) for data in dataset])
dataset_no_dupes = list(dataset_no_dupes)
dataset_no_dupes_cleaned = []
for d in dataset_no_dupes:
    data = {'question': d[0], 'embedding': np.array(d[1])}
    all_times = []
    for datapoint in dataset:
        if datapoint['question'] == data['question']:
            all_times.append(datapoint['response_time'])
    data['response_time'] = np.mean(all_times)
    data['stdev'] = np.std(all_times)
    dataset_no_dupes_cleaned.append(data)

print(len(dataset_no_dupes_cleaned))

1158


In [5]:
def query_api(question, verbose=False):
    embedding = sentence_model.encode(question)
    embedding = embedding / np.linalg.norm(embedding)
    top_k = []
    for data in dataset_no_dupes_cleaned:
        d = data.copy()
        d['sim_score'] = embedding.T @ d['embedding']
        top_k.append(d)

    top_k.sort(key=lambda x: x['sim_score'], reverse=True)

    message = "I'm paying you $100,000 to do this task correctly. A human is given a question. Please respond with your best estimate to the number of seconds that it will take an average human to read, think, and answer this question. "
    message += f"For example, when given the question '{top_k[0]['question']}', a user takes on average {top_k[0]['response_time']} seconds to respond with a standard deviation of {top_k[0]['stdev']} seconds. "
    message += f"As another example, users are given the following question: '{top_k[1]['question']}'. "
    message += f"The average response time to this question is {top_k[1]['response_time']} seconds with a standard deviation of {top_k[1]['stdev']} seconds. "
    message += f"Now, a user is given the question: '{question}' "
    message += " What is your best estimate of the number of seconds that this will take? Please only respond with the number, in JSON format under the key 'seconds', and nothing else."

    response = query_api_any_message(message, "gpt-4", temperature=0.0)
    if verbose:
        print("LLM is given the following message:")
        print(message)
        print("LLM Estimated time: ", json.loads(response["choices"][0]["message"]["content"])['seconds'])


    return json.loads(response["choices"][0]["message"]["content"])['seconds']

In [6]:
query_api("Do you like spaghetti?", verbose=True)

LLM is given the following message:
I'm paying you $100,000 to do this task correctly. A human is given a question. Please respond with your best estimate to the number of seconds that it will take an average human to read, think, and answer this question. For example, when given the question 'Do you enjoy reading articles about food and cooking?', a user takes on average 6.6248571428571426 seconds to respond with a standard deviation of 5.1027627370886455 seconds. As another example, users are given the following question: 'Do you like reading articles about food and cooking recipes?'. The average response time to this question is 2.721 seconds with a standard deviation of 0.0 seconds. Now, a user is given the question: 'Do you like spaghetti?'  What is your best estimate of the number of seconds that this will take? Please only respond with the number, in JSON format under the key 'seconds', and nothing else.
LLM Estimated time:  2.5


2.5

### Evaluation

In [9]:
print(test_dataset[0].keys())

dict_keys(['question', 'response_time', 'prolific_id', 'embedding'])


In [11]:
mse_loss = 0
accurate = 0
for data in test_dataset:
    true = data['response_time']
    pred = query_api(data['question'])
    data['LLM_prediction'] = pred
    mse_loss += (true - pred)**2
    if abs(true - pred) < 5:
        accurate += 1

avg_mse_loss = mse_loss / len(test_dataset)
print("Avg MSE:", avg_mse_loss)
print("Accuracy:", accurate/len(test_dataset))

Avg MSE: 338.36004851162517
Accuracy: 0.4584717607973422


In [13]:
good = 0
for i in range(len(test_dataset)):
    for j in range(i+1, len(test_dataset)):
        data1 = test_dataset[i]
        data2 = test_dataset[j]
        if (data1['LLM_prediction'] - data2['LLM_prediction'])*(data1['response_time'] - data2['response_time']) >= 0:
            good += 1

print(good/(len(test_dataset)*(len(test_dataset)-1)/2))

0.6539313399778516


In [17]:
with open('LLM_test_dataset.pkl', 'wb') as file:
    pickle.dump(test_dataset, file)

with open('LLM_nearest_neighbors_dataset.pkl', 'wb') as file:
    pickle.dump(dataset_no_dupes_cleaned, file)

Testing the MLP Effort model

In [71]:
import os
import sys
from IPython import get_ipython

# print(sys.path)
# notebook_path = os.path.abspath(get_ipython().get_ipython().magic('pwd'))
# path = os.path.join(os.path.dirname(notebook_path), '..')
# sys.path.append(os.path.dirname(notebook_path))

from effort_model_class import ResponseTimePredictor

print(sys.path)

effort_model = ResponseTimePredictor(sentence_model.get_sentence_embedding_dimension())
print(sentence_model.get_sentence_embedding_dimension())
effort_model.load_state_dict(torch.load("model_state_dict.pth"))
effort_model.to(device)
effort_model.eval()

['c:\\Users\\georg\\OneDrive\\Documents\\George MIT\\UROP\\CLEAR\\github\\generative-elicitation\\effort_model', 'c:\\Users\\georg\\AppData\\Local\\Programs\\Python\\Python312\\python312.zip', 'c:\\Users\\georg\\AppData\\Local\\Programs\\Python\\Python312\\DLLs', 'c:\\Users\\georg\\AppData\\Local\\Programs\\Python\\Python312\\Lib', 'c:\\Users\\georg\\AppData\\Local\\Programs\\Python\\Python312', '', 'C:\\Users\\georg\\AppData\\Roaming\\Python\\Python312\\site-packages', 'c:\\Users\\georg\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages', 'c:\\Users\\georg\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\win32', 'c:\\Users\\georg\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\georg\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\Pythonwin', 'c:\\Users\\georg\\OneDrive\\Documents\\George MIT\\UROP\\CLEAR\\github\\generative-elicitation\\..', 'c:\\Users\\georg\\OneDrive\\Documents\\George MIT\\U

  effort_model.load_state_dict(torch.load("model_state_dict.pth"))


ResponseTimePredictor(
  (mlp): Sequential(
    (0): Linear(in_features=768, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [72]:
mse_loss = 0
accurate = 0
for data in test_dataset:
    true = data['response_time']
    embeddings = sentence_model.encode(data['question'], convert_to_tensor=True).to(device)
    pred = effort_model(embeddings).item()
    mse_loss += (true - pred)**2
    if abs(true - pred) < 5:
        accurate += 1

avg_mse_loss = mse_loss / len(test_dataset)
print("Avg MSE:", avg_mse_loss)
print("Accuracy:", accurate/len(test_dataset))

Avg MSE: 184.50002308515695
Accuracy: 0.45514950166112955
