In [None]:
# 240630
from glob import glob
import pickle
from openai  import OpenAI
import os
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor
from itertools import product
import json
from torch.utils.data import Dataset, DataLoader
from format import *

# Read the API key from the environment rather than hard-coding it in the notebook,
# so the secret never ends up in the saved file. When the variable is unset this
# passes None, and the OpenAI client then performs its own OPENAI_API_KEY lookup
# and reports a missing key itself.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [2]:
class ModelResponder:
    """Collect chat-completion responses for every (question file, prompt function, context) combination.

    For each combination the question file is loaded as JSON, wrapped in an
    ``ExamDataset`` and iterated in batches; every question of a batch is sent
    to the chat-completions endpoint of the module-level ``client`` from a
    thread pool, and the answered question dicts are appended to a JSON file
    under ``output/``.

    ``ExamDataset``, ``collate_fn`` and ``client`` are not defined in this
    class; they must be available at module level (presumably via
    ``from format import *`` and the client cell -- TODO confirm).
    """

    # Maximum number of API attempts per question before it is skipped.
    MAX_RETRIES = 5
    # Seconds to wait between two failed attempts.
    RETRY_DELAY_SECONDS = 60

    def __init__(self, model_path, ques_path_list, context_list, prompt_func_list=None, path=None):
        """Store the model name and pre-compute all run combinations.

        Args:
            model_path: Model identifier passed as ``model=`` to the API.
            ques_path_list: Paths of the JSON question files to process.
            context_list: System-message strings, one run per entry.
            prompt_func_list: Optional callables ``(inst, ques) -> str``; when
                empty or None a single function returning ``ques`` unchanged
                is used.
            path: Optional name used in output file names; defaults to the
                basename of ``model_path``.
        """
        self.batch_size = 350
        self.model_path = model_path
        self.path = path if path is not None else os.path.basename(model_path)

        if not prompt_func_list:
            default_prompt_func = lambda inst, ques: f"{ques}"
            prompt_func_list = [default_prompt_func]
        self.prompt_func_list = prompt_func_list

        # Keep each item's index next to it: the indices end up in the output file name.
        indexed_ques_paths = list(enumerate(ques_path_list))
        indexed_prompt_funcs = list(enumerate(self.prompt_func_list))
        indexed_contexts = list(enumerate(context_list))
        self.combinations = list(product(indexed_ques_paths, indexed_prompt_funcs, indexed_contexts))

    def process_and_update(self, ques_dict, inst, prompt_func, batch_pbar):
        """Request one completion and store it under ``ques_dict['response']``.

        The request is attempted up to ``MAX_RETRIES`` times, waiting
        ``RETRY_DELAY_SECONDS`` between attempts. If every attempt fails the
        response is set to ``None``. The progress bar is advanced exactly once
        per question, whether it succeeded or was skipped.

        Args:
            ques_dict: Question record; ``ques_dict['input']`` is sent as the
                user message. ``year``/``session``/``question_number`` are only
                used to label log messages.
            inst: System-message content.
            prompt_func: Unused here; accepted for signature compatibility.
            batch_pbar: Object with an ``update(n)`` method (a tqdm bar).

        Returns:
            The same ``ques_dict``, mutated in place.
        """
        ques = ques_dict['input']
        # Built with .get so that a record lacking these fields cannot raise from
        # inside the error handler and abort the whole batch.
        current = (
            f"{ques_dict.get('year', '?')}-"
            f"{ques_dict.get('session', '?')}-"
            f"{ques_dict.get('question_number', '?')}"
        )

        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                response = client.chat.completions.create(
                    model=self.model_path,
                    messages=[
                        {"role": "system", "content": inst},
                        {"role": "user", "content": ques}
                    ],
                    # Near-zero sampling parameters plus a fixed seed -- presumably to
                    # keep generations as reproducible as the API allows.
                    temperature=0.0000000000000000000000000000000000000000001,
                    top_p=0.0000000000000000000000000000000000000000001,
                    seed=100,
                    )
                output = response.choices[0].message.content
            except Exception as e:
                print(e)
                print(f"Retrying '{current}'... {attempt}/{self.MAX_RETRIES}")
                # Only wait when another attempt will actually follow.
                if attempt < self.MAX_RETRIES:
                    time.sleep(self.RETRY_DELAY_SECONDS)
                continue

            # Count the question only after the response was fully extracted, so a
            # failure while reading it cannot advance the bar twice.
            ques_dict['response'] = output
            batch_pbar.update(1)
            return ques_dict

        print(f"Failed to generate response for '{current}', skipping...")
        ques_dict['response'] = None
        batch_pbar.update(1)
        return ques_dict

    def process_files(self):
        """Run every combination and append the results to its JSON output file.

        Output goes to ``output/<path> [f<prompt>_p<context>_q<question>].json``
        (1-based indices). The file is rewritten after each batch; if it already
        exists, including from an earlier run, new results are appended to its
        current contents.
        """
        for (ques_idx, ques_path), (prompt_idx, prompt_func), (context_idx, inst) in self.combinations:
            # NOTE(review): opened with the platform default encoding -- confirm
            # whether the question files should be read as utf-8 explicitly.
            with open(ques_path) as f:
                exam = json.load(f)
            filename = f'output/{self.path} [f{prompt_idx+1}_p{context_idx+1}_q{ques_idx+1}].json'
            dataset = ExamDataset(exam, inst, prompt_func)
            dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False, num_workers=0, collate_fn=collate_fn)
            with tqdm(total=len(dataloader), desc=filename, leave=True, position=1) as pbar:
                for i, batch in enumerate(dataloader):
                    # One API call per question, issued concurrently; executor.map
                    # keeps the results in the order of the batch.
                    with ThreadPoolExecutor() as executor:
                        with tqdm(total=len(batch), desc=f"Batch {i}", leave=True, position=0) as batch_pbar:
                            results = list(executor.map(lambda ques: self.process_and_update(ques, inst, prompt_func, batch_pbar), batch))
                    if os.path.exists(filename):
                        # Must match the utf-8 encoding used for writing below; the file
                        # holds raw non-ASCII text because of ensure_ascii=False.
                        with open(filename, 'r', encoding='utf-8') as file:
                            resp = json.load(file)
                    else:
                        resp = []
                        os.makedirs(os.path.dirname(filename), exist_ok=True)
                    resp.extend(results)
                    with open(filename, 'w', encoding='utf-8') as f:
                        json.dump(resp, f, indent=4, ensure_ascii=False)
                    pbar.update(1)

In [3]:
# Collect gpt-4o-2024-05-13 responses for every combination of exam_list x inst_list.
# exam_list / inst_list are not defined in any visible cell -- presumably they come
# from `from format import *` (TODO confirm).
model_path="gpt-4o-2024-05-13"

run = ModelResponder(model_path, exam_list, inst_list)
run.process_files()

Batch 0: 100%|██████████| 350/350 [00:09<00:00, 38.27it/s] 0/6 [00:00<?, ?it/s]
Batch 1: 100%|██████████| 350/350 [00:24<00:00, 14.06it/s] 1/6 [00:09<00:45,  9.16s/it]
Batch 2: 100%|██████████| 350/350 [00:09<00:00, 36.10it/s] 2/6 [00:34<01:13, 18.42s/it]
Batch 3: 100%|██████████| 350/350 [00:12<00:00, 27.17it/s] 3/6 [00:43<00:43, 14.44s/it]
Batch 4: 100%|██████████| 350/350 [00:10<00:00, 32.47it/s] 4/6 [00:56<00:27, 13.84s/it]
Batch 5: 100%|██████████| 350/350 [00:16<00:00, 21.13it/s] 5/6 [01:07<00:12, 12.75s/it]
output/gpt-4o-2024-05-13 [f1_p1_q1].json: 100%|██████████| 6/6 [01:24<00:00, 14.02s/it]
Batch 0: 100%|██████████| 350/350 [00:20<00:00, 17.49it/s] 0/6 [00:00<?, ?it/s]
Batch 1: 100%|██████████| 350/350 [00:09<00:00, 37.36it/s] 1/6 [00:20<01:40, 20.02s/it]
Batch 2: 100%|██████████| 350/350 [00:08<00:00, 38.90it/s] 2/6 [00:29<00:55, 13.76s/it]
Batch 3: 100%|██████████| 350/350 [00:08<00:00, 39.85it/s] 3/6 [00:38<00:34, 11.60s/it]
Batch 4: 100%|██████████| 350/350 [00:09<00:00, 

In [4]:
# Collect gpt-4-turbo-2024-04-09 responses for every combination of exam_list x inst_list.
# exam_list / inst_list are not defined in any visible cell -- presumably they come
# from `from format import *` (TODO confirm).
model_path="gpt-4-turbo-2024-04-09"

run = ModelResponder(model_path, exam_list, inst_list)
run.process_files()

Batch 0: 100%|██████████| 350/350 [00:19<00:00, 17.50it/s]    | 0/6 [00:00<?, ?it/s]
Batch 1: 100%|██████████| 350/350 [00:11<00:00, 29.74it/s]    | 1/6 [00:20<01:40, 20.01s/it]
Batch 2: 100%|██████████| 350/350 [00:18<00:00, 18.65it/s]    | 2/6 [00:31<01:00, 15.17s/it]
Batch 3: 100%|██████████| 350/350 [00:12<00:00, 26.99it/s]    | 3/6 [00:50<00:50, 16.83s/it]
Batch 4: 100%|██████████| 350/350 [00:11<00:00, 29.59it/s]▋   | 4/6 [01:03<00:30, 15.32s/it]
Batch 5: 100%|██████████| 350/350 [00:24<00:00, 14.44it/s]██▎ | 5/6 [01:15<00:14, 14.07s/it]
output/gpt-4-turbo-2024-04-09 [f1_p1_q1].json: 100%|██████████| 6/6 [01:39<00:00, 16.63s/it]
Batch 0: 100%|██████████| 350/350 [00:11<00:00, 29.21it/s]    | 0/6 [00:00<?, ?it/s]
Batch 1: 100%|██████████| 350/350 [00:11<00:00, 29.31it/s]    | 1/6 [00:11<00:59, 11.99s/it]
Batch 2: 100%|██████████| 350/350 [00:11<00:00, 29.42it/s]    | 2/6 [00:23<00:47, 11.97s/it]
Batch 3: 100%|██████████| 350/350 [00:23<00:00, 14.72it/s]    | 3/6 [00:35<00:35, 11.9

In [5]:
# Collect gpt-3.5-turbo-0125 responses for every combination of exam_list x inst_list.
# exam_list / inst_list are not defined in any visible cell -- presumably they come
# from `from format import *` (TODO confirm).
model_path="gpt-3.5-turbo-0125"

run = ModelResponder(model_path, exam_list, inst_list)
run.process_files()

Batch 0: 100%|██████████| 350/350 [00:11<00:00, 29.74it/s]| 0/6 [00:00<?, ?it/s]
Batch 1: 100%|██████████| 350/350 [00:10<00:00, 33.64it/s]| 1/6 [00:11<00:58, 11.78s/it]
Batch 2: 100%|██████████| 350/350 [00:10<00:00, 33.36it/s]| 2/6 [00:22<00:43, 10.98s/it]
Batch 3: 100%|██████████| 350/350 [00:09<00:00, 35.14it/s]| 3/6 [00:32<00:32, 10.77s/it]
Batch 4: 100%|██████████| 350/350 [00:09<00:00, 35.15it/s]| 4/6 [00:42<00:20, 10.46s/it]
Batch 5: 100%|██████████| 350/350 [00:10<00:00, 34.91it/s]| 5/6 [00:52<00:10, 10.29s/it]
output/gpt-3.5-turbo-0125 [f1_p1_q1].json: 100%|██████████| 6/6 [01:02<00:00, 10.46s/it]
Batch 0: 100%|██████████| 350/350 [00:09<00:00, 36.76it/s]| 0/6 [00:00<?, ?it/s]
Batch 1: 100%|██████████| 350/350 [00:09<00:00, 37.08it/s]| 1/6 [00:09<00:47,  9.53s/it]
Batch 2: 100%|██████████| 350/350 [00:10<00:00, 34.96it/s]| 2/6 [00:18<00:37,  9.49s/it]
Batch 3: 100%|██████████| 350/350 [00:09<00:00, 36.71it/s]| 3/6 [00:29<00:29,  9.74s/it]
Batch 4: 100%|██████████| 350/350 [00

In [6]:
# Collect gpt-4-0613 responses for every combination of exam_list x inst_list.
# exam_list / inst_list are not defined in any visible cell -- presumably they come
# from `from format import *` (TODO confirm).
model_path="gpt-4-0613"
run = ModelResponder(model_path, exam_list, inst_list)
run.process_files()

Batch 0: 100%|██████████| 350/350 [00:17<00:00, 19.66it/s]0:00<?, ?it/s]
Batch 1: 100%|██████████| 350/350 [00:16<00:00, 21.03it/s]0:17<01:29, 17.81s/it]
Batch 2: 100%|██████████| 350/350 [00:17<00:00, 19.70it/s]0:34<01:08, 17.14s/it]
Batch 3: 100%|██████████| 350/350 [00:16<00:00, 20.79it/s]0:52<00:52, 17.43s/it]
Batch 4: 100%|██████████| 350/350 [00:24<00:00, 14.53it/s]1:09<00:34, 17.21s/it]
Batch 5: 100%|██████████| 350/350 [00:17<00:00, 20.37it/s]1:33<00:19, 19.70s/it]
output/gpt-4-0613 [f1_p1_q1].json: 100%|██████████| 6/6 [01:50<00:00, 18.42s/it]
Batch 0: 100%|██████████| 350/350 [00:15<00:00, 22.18it/s]0:00<?, ?it/s]
Batch 1: 100%|██████████| 350/350 [00:14<00:00, 24.23it/s]0:15<01:18, 15.79s/it]
Batch 2: 100%|██████████| 350/350 [00:14<00:00, 23.92it/s]0:30<01:00, 15.01s/it]
Batch 3: 100%|██████████| 350/350 [00:15<00:00, 23.28it/s]0:44<00:44, 14.85s/it]
Batch 4: 100%|██████████| 350/350 [00:15<00:00, 22.65it/s]0:59<00:29, 14.93s/it]
Batch 5: 100%|██████████| 350/350 [00:14<00:

In [7]:
# Collect gpt-3.5-turbo-0613 responses for every combination of exam_list x inst_list.
# exam_list / inst_list are not defined in any visible cell -- presumably they come
# from `from format import *` (TODO confirm).
model_path="gpt-3.5-turbo-0613"
run = ModelResponder(model_path, exam_list, inst_list)
run.process_files()

Batch 0: 100%|██████████| 350/350 [00:13<00:00, 26.59it/s]| 0/6 [00:00<?, ?it/s]
Batch 1: 100%|██████████| 350/350 [00:12<00:00, 28.77it/s]| 1/6 [00:13<01:05, 13.17s/it]
Batch 2: 100%|██████████| 350/350 [00:12<00:00, 27.35it/s]| 2/6 [00:25<00:50, 12.59s/it]
Batch 3: 100%|██████████| 350/350 [00:23<00:00, 14.64it/s]| 3/6 [00:38<00:38, 12.69s/it]
Batch 4: 100%|██████████| 350/350 [00:12<00:00, 27.45it/s]| 4/6 [01:02<00:34, 17.13s/it]
Batch 5: 100%|██████████| 350/350 [00:13<00:00, 26.16it/s]| 5/6 [01:14<00:15, 15.56s/it]
output/gpt-3.5-turbo-0613 [f1_p1_q1].json: 100%|██████████| 6/6 [01:28<00:00, 14.72s/it]
Batch 0: 100%|██████████| 350/350 [00:12<00:00, 27.23it/s]| 0/6 [00:00<?, ?it/s]
Batch 1: 100%|██████████| 350/350 [00:13<00:00, 26.24it/s]| 1/6 [00:12<01:04, 12.87s/it]
Batch 2: 100%|██████████| 350/350 [00:11<00:00, 30.94it/s]| 2/6 [00:26<00:52, 13.16s/it]
Batch 3: 100%|██████████| 350/350 [00:13<00:00, 26.25it/s]| 3/6 [00:37<00:36, 12.33s/it]
Batch 4: 100%|██████████| 350/350 [00

In [8]:
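# Disabled run for gpt-4-0314 (kept for reference).
# NOTE: this cell passes `ques_path_list`, whereas every active cell passes `exam_list`;
# update the argument before re-enabling.
# model_path="gpt-4-0314"
# run = ModelResponder(model_path, ques_path_list, inst_list)
# run.process_files()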

In [9]:
# Collect gpt-3.5-turbo-0301 responses for every combination of exam_list x inst_list.
# exam_list / inst_list are not defined in any visible cell -- presumably they come
# from `from format import *` (TODO confirm).
model_path="gpt-3.5-turbo-0301"
run = ModelResponder(model_path, exam_list, inst_list)
run.process_files()

Batch 0: 100%|██████████| 350/350 [00:09<00:00, 36.84it/s]| 0/6 [00:00<?, ?it/s]
Batch 1: 100%|██████████| 350/350 [00:09<00:00, 36.80it/s]| 1/6 [00:09<00:47,  9.51s/it]
Batch 2: 100%|██████████| 350/350 [00:08<00:00, 39.28it/s]| 2/6 [00:19<00:38,  9.52s/it]
Batch 3: 100%|██████████| 350/350 [00:08<00:00, 39.41it/s]| 3/6 [00:27<00:27,  9.25s/it]
Batch 4: 100%|██████████| 350/350 [00:09<00:00, 35.04it/s]| 4/6 [00:36<00:18,  9.12s/it]
Batch 5: 100%|██████████| 350/350 [00:09<00:00, 36.12it/s]| 5/6 [00:46<00:09,  9.44s/it]
output/gpt-3.5-turbo-0301 [f1_p1_q1].json: 100%|██████████| 6/6 [00:56<00:00,  9.44s/it]
Batch 0: 100%|██████████| 350/350 [00:08<00:00, 42.02it/s]| 0/6 [00:00<?, ?it/s]
Batch 1: 100%|██████████| 350/350 [00:08<00:00, 42.31it/s]| 1/6 [00:08<00:41,  8.34s/it]
Batch 2: 100%|██████████| 350/350 [00:08<00:00, 41.71it/s]| 2/6 [00:16<00:33,  8.31s/it]
Batch 3: 100%|██████████| 350/350 [00:09<00:00, 37.86it/s]| 3/6 [00:25<00:25,  8.36s/it]
Batch 4: 100%|██████████| 350/350 [00

In [3]:
# Collect gpt-4-1106-preview responses for every combination of exam_list x inst_list.
# exam_list / inst_list are not defined in any visible cell -- presumably they come
# from `from format import *` (TODO confirm).
model_path="gpt-4-1106-preview"
run = ModelResponder(model_path, exam_list, inst_list)
run.process_files()

Batch 0: 100%|██████████| 350/350 [00:17<00:00, 19.67it/s]
Batch 1: 100%|██████████| 350/350 [00:16<00:00, 21.27it/s]
Batch 2: 100%|██████████| 350/350 [00:16<00:00, 20.59it/s]
Batch 3: 100%|██████████| 350/350 [00:17<00:00, 20.11it/s]
Batch 4: 100%|██████████| 350/350 [00:16<00:00, 20.63it/s]
Batch 5: 100%|██████████| 350/350 [00:26<00:00, 13.27it/s]
output/gpt-4-1106-preview [f1_p1_q1].json: 100%|██████████| 6/6 [01:52<00:00, 18.69s/it]
Batch 0: 100%|██████████| 350/350 [00:15<00:00, 22.11it/s]
Batch 1: 100%|██████████| 350/350 [00:23<00:00, 14.79it/s]
Batch 2: 100%|██████████| 350/350 [00:23<00:00, 14.91it/s]
Batch 3: 100%|██████████| 350/350 [00:15<00:00, 22.74it/s]
Batch 4: 100%|██████████| 350/350 [00:15<00:00, 22.15it/s]
Batch 5: 100%|██████████| 350/350 [00:17<00:00, 19.86it/s]
output/gpt-4-1106-preview [f1_p1_q2].json: 100%|██████████| 6/6 [01:51<00:00, 18.66s/it]
