In [2]:
from openai import OpenAI
from pydantic import BaseModel
import functions.prompts as prompts
import functions.llm as llm
import functions.fetch as fetch
import pandas as pd
import json
from openai.lib._parsing._completions import type_to_response_format_param

import importlib
# Re-import the project helper modules so edits made to them on disk are
# picked up without restarting the kernel (same order as before).
for _module in (llm, prompts, fetch):
    importlib.reload(_module)

# Shared OpenAI client used by the batch cells below.
client = OpenAI()

In [3]:
# Structured note attached to a paper summary; nested inside AbstractSummary.
# Documented with comments rather than a docstring on purpose: pydantic copies
# a model docstring into the generated JSON schema, which would change the
# response_format built from these models.
class Note(BaseModel):
    # Label for the kind of note. Allowed values are not constrained here —
    # presumably they are defined by the system prompt (TODO confirm).
    type: str
    # Text of the note.
    description: str

# Schema of the structured answer requested for each paper. It is converted to
# a response_format when the batch requests are built and is used again to
# validate each returned message when the batch output is parsed.
# Documented with comments rather than a docstring on purpose: pydantic copies
# a model docstring into the generated JSON schema, which would change the
# request payload.
class AbstractSummary(BaseModel):
    # Keywords for the paper, as a list of strings.
    keywords: list[str]
    # Summary strings; the parsing cell stores each entry as one "finding" row.
    summaries: list[str]
    # One structured Note for the paper.
    note: Note

In [5]:
# Build one batch request per row of the abstracts CSV and record which
# request ID belongs to which paper.
df = pd.read_csv("./data/abstract/abstract.csv")

# The response schema is identical for every request, so derive it once
# instead of once per row.
response_format = type_to_response_format_param(AbstractSummary)

reqs = []
ref_rows = []
for i, row in df.iterrows():
    # `i` is the row's index label from the CSV load, so each custom ID is
    # unique within this request file.
    custom_id = f"paper-{i}"
    reqs.append(llm.wrap(custom_id, {
        "model": "o3-mini",
        "messages": [{
            "role": "system",
            "content": prompts.findings
        }, {
            "role": "user",
            # NOTE(review): this concatenation raises TypeError if title or
            # abstract is missing (NaN) — confirm the CSV has no gaps.
            "content": row["title"] + "\n" + row["abstract"]
        }],
        "response_format": response_format
    }))
    ref_rows.append([row["paperId"], custom_id])

# Assemble the paper-ID -> request-ID lookup in one go rather than enlarging a
# DataFrame row by row (which copies the frame on every insert).
df_ref = pd.DataFrame(ref_rows, columns=["id", "ref_id"])

req_path = "data/findings/req.jsonl"
llm.gen_batch_jsonl(req_path, reqs)
df_ref.to_csv('data/findings/ref.csv', index=False)

In [None]:
# Hand the request file at `req_path` to the project's batch helper
# (presumably it uploads the file and creates the batch — TODO confirm in
# functions/llm.py). The result is indexed by 'batch_file' and 'batch_object'
# in the cells that follow.
batch = llm.gen_batch(client, req_path, metadata="Atlas of Human-AI Interaction")

In [None]:
# Pull the two identifiers out once, show them, then persist them to disk so
# they are still available after the kernel is gone.
file_id = batch['batch_file']
batch_id = batch['batch_object']

print(f"File ID: {file_id}")
print(f"Batch ID: {batch_id}")

fetch.save(
    f"File ID: {file_id} \nBatch ID: {batch_id}",
    "data/findings/req_file_id.txt",
)

In [None]:
# Manual fallback (presumably for a fresh kernel where `batch` no longer
# exists): uncomment and fill in the batch ID, e.g. the one saved to
# data/findings/req_file_id.txt.
# batch = {
#     "batch_object": ""
# }
# Look up the batch by the value stored under 'batch_object'. This is the
# cell's last expression, so the returned object is displayed as the output.
client.batches.retrieve(batch['batch_object'])

In [None]:
# Path of the raw batch output; the parsing cell reads this file back.
res_path = "data/findings/res.jsonl"
# Passes the client, the value stored under 'batch_object', and the destination
# path to the project helper. Presumably it downloads the batch output to
# `res_path` and returns the output file's ID — TODO confirm in functions/llm.py.
output_file_id = llm.get_batch_result(client, batch['batch_object'], res_path)

In [None]:
# Parse the batch output (one JSON object per line) into two tables:
#   df          - one row per response: custom ID plus stringified keywords,
#                 summaries and note
#   df_findings - one row per individual summary string
# `pd` and `json` come from the notebook's import cell.

# Context manager closes the file even if reading fails.
with open(res_path, "r") as f:
    res = f.read()

summary_rows = []
finding_rows = []
for raw_line in res.split("\n"):
    # Skip blank lines instead of blindly dropping the last element: that way
    # the final record survives when the file has no trailing newline, and
    # stray empty lines do not crash json.loads.
    if not raw_line.strip():
        continue
    line = json.loads(raw_line)
    custom_id = line['custom_id']  # named to avoid shadowing the builtin `id`
    # The message content is the model's JSON answer; validate it against the
    # schema the request asked for.
    content = AbstractSummary.model_validate_json(
        line['response']['body']['choices'][0]['message']['content']
    )
    summary_rows.append(
        [custom_id, str(content.keywords), str(content.summaries), str(content.note)]
    )
    finding_rows.extend([custom_id, finding] for finding in content.summaries)

# Build each frame once from the collected rows rather than enlarging it row
# by row; an empty input still yields frames with the expected columns.
df = pd.DataFrame(summary_rows, columns=["paper-id", "keywords", "summaries", "notes"])
df_findings = pd.DataFrame(finding_rows, columns=["paper-id", "finding"])

In [None]:
# Write both result tables to CSV, leaving out the positional index column.
csv_outputs = (
    (df, 'data/findings/res.csv'),
    (df_findings, 'data/findings/findings.csv'),
)
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)