In [1]:
import json
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import pickle

In [3]:
FILE_PATH = 'annotations_gpt-4/annotations_gpt-4'


def _pair_questions_with_response_times(history, prolific_id):
    """Pair each assistant message with the user reply that follows it.

    Args:
        history: Iterable of conversation entries. Each entry is indexed by
            'sender'; non-user entries are read for 'message' and user
            entries for 'time_spent_ms'.
        prolific_id: Participant identifier copied into every record.

    Returns:
        A list of dicts with keys "question" (the most recent assistant
        message), "response_time" ('time_spent_ms' divided by 1000, i.e.
        seconds) and "prolific_id".
    """
    records = []
    pending_question = None
    for item in history:
        if item['sender'] != 'user':
            # Every non-user entry is treated as an assistant message; a
            # later one replaces an earlier one that was never answered.
            pending_question = item['message']
            continue
        if pending_question is None:
            # User entry with no preceding unanswered question: nothing to pair.
            continue
        records.append({
            "question": pending_question,
            "response_time": item['time_spent_ms'] / 1000,
            "prolific_id": prolific_id
        })
        # Each question is consumed by at most one user reply.
        pending_question = None
    return records


# JSON is read as UTF-8 explicitly: the platform default encoding (e.g. on
# Windows) is not UTF-8 and can mis-decode non-ASCII text in the messages.
with open(f'{FILE_PATH}/experiment_type_to_prolific_id.json', encoding='utf-8') as file:
    experiment_types = json.load(file)

website_preferences = experiment_types['website_preferences']

query_to_time = []
for query_type in website_preferences:
    print("QUERY TYPE:", query_type)

    # This query type is excluded from the dataset.
    if query_type == "Non-interactive":
        continue

    for i, prolific_id in enumerate(website_preferences[query_type]):
        # NOTE(review): the first ID listed for each query type is skipped;
        # presumably a pilot/test participant -- confirm.
        if i == 0:
            continue
        with open(f'{FILE_PATH}/{prolific_id}.json', encoding='utf-8') as file:
            person_data = json.load(file)

        history = person_data['conversation_history']
        query_to_time.extend(_pair_questions_with_response_times(history, prolific_id))

# Persist the flat list of records so later cells can reload it.
with open('query_to_time_dataset.pkl', 'wb') as file:
    pickle.dump(query_to_time, file)


QUERY TYPE: Supervised Learning
QUERY TYPE: Non-interactive
QUERY TYPE: Pool-based Active Learning
QUERY TYPE: Generative edge cases
QUERY TYPE: Generative yes/no questions
QUERY TYPE: Generative open-ended questions


### Encoding all sentences


In [8]:
import pickle
from sentence_transformers import SentenceTransformer
import torch
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

  from tqdm.autonotebook import tqdm, trange


In [9]:
# Use CUDA when torch reports it as available, otherwise the CPU.
# NOTE(review): `device` is not passed to the model constructor below.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Folder handed to SentenceTransformer as its cache_folder.
cache_dir = "C:\\LLMs"

sentence_model = SentenceTransformer(
    'all-mpnet-base-v2',
    cache_folder=cache_dir,
)
# Alternative encoder, currently disabled:
# sentence_model = SentenceTransformer('nvidia/NV-Embed-v2', trust_remote_code=True, cache_folder = cache_dir)



In [13]:
# Reload the question / response-time records from 'query_to_time_dataset.pkl'.
# pickle.load can execute arbitrary code, so only load files you produced yourself.
with open('query_to_time_dataset.pkl', 'rb') as f:
    dataset = pickle.load(f)

# Show the record count and a short preview instead of dumping every record
# into the cell output.
print(f"Loaded {len(dataset)} records")
print(dataset[:3])



In [15]:
# Attach a sentence embedding to every record, in place.
# tqdm wraps the list itself rather than an enumerate() generator so that it
# can read len(dataset) and display a total, percentage and ETA.
for data in tqdm(dataset, desc="Encoding questions"):
    data['embedding'] = sentence_model.encode(data['question'])

3013it [02:41, 18.71it/s]


In [16]:
# Write `dataset` (each record now also holds an 'embedding' entry) to disk.
with open('query_to_time_embedding_dataset.pkl', 'wb') as out_handle:
    pickle.dump(dataset, out_handle)