In [1]:
from openai import OpenAI
from dotenv import load_dotenv
import os
from pathlib import Path


# Read the OpenAI credentials from a dotenv file kept outside the notebook
# directory, so no secret is hard-coded in the notebook itself.
env_path = "../../keys.env"
load_dotenv(dotenv_path=env_path)

# Each value is None when the corresponding variable is not set.
api_key, project, organization = (
    os.getenv(var_name) for var_name in ("API_KEY", "PROJECT", "ORGANIZATION")
)

In [2]:

# Single API client used by every cell below; the credentials come from the
# environment variables loaded in the first cell.
client = OpenAI(api_key=api_key, organization=organization, project=project)

In [45]:
# Register the "MINI" assistant with the file_search tool enabled and print its
# ID so it can be reused later.
# NOTE(review): the next cell rebinds `assistant` to an assistant retrieved by a
# fixed ID, so re-running this cell creates an additional assistant each time.
assistant = client.beta.assistants.create(
    model="gpt-4o-mini",
    name="Health Economics Research Paper Analyst - MINI",
    instructions="You are an scientific researcher in the area of health economics. Use your knowledge base to help answer questions about the health economics research papers.",
    tools=[{"type": "file_search"}],
)
print("ID of MINI", assistant.id, sep="\n")

ID of MINI
asst_goyXy5YArE5F3Z2OX3WhCfY4


In [3]:
# Reuse an already-created assistant instead of making a new one; the ID is the
# one printed by the creation cell.
ASSISTANT_ID = "asst_goyXy5YArE5F3Z2OX3WhCfY4"
assistant = client.beta.assistants.retrieve(ASSISTANT_ID)


##### Step 2: Upload files and add them to a Vector Store


In [4]:
# Collect every PDF in the input directory and open it for binary reading.
dir_str_path = "test_pdfs/"
dir_path = Path(dir_str_path)

# Materialise the glob once so names, paths and streams share the same order.
pdf_files = list(dir_path.glob("*.pdf"))
file_names = [pdf.name for pdf in pdf_files]
# Paths are kept as "<dir>/<name>" strings; later cells split them on "/".
file_paths = [dir_str_path + pdf_name for pdf_name in file_names]

# NOTE: these handles stay open; later cells upload from them and read `.name`.
file_streams = [open(pdf_path, "rb") for pdf_path in file_paths]


##### Step 4: Define the extraction queries and create a thread per paper

In [5]:

# Maps an output field name to the question sent to the assistant for that
# field. Insertion order is the order in which the questions are posted.
# Each key appears exactly once: a repeated key in a dict literal silently
# overwrites the earlier entry.
queries = {
    # Bibliographic metadata
    "Paper_ID": "Extract the unique identifier for this paper, if available. If not leave empty.",
    "Title": "Please provide the title of the paper.",
    "Authors": "List the authors of the paper.",
    "Year_of_Publication": "What year was this paper published?",
    "Journal_Name": "Provide the name of the journal where this paper was published.",
    "DOI_URL": "Extract the DOI or URL for this paper.",
    # Study design
    "Theory_Hypotheses": "Please extract the main theories or hypotheses of the study.",
    "Identification_Strategy": "Please explain the approach to establishing causality used in this study.",
    "Exogeneity_Assumptions": "List any assumptions about exogeneity made in this study.",
    "Control_Treatment_Definition": "Define the control and treatment groups as described in the study.",
    "Intervention_Components": "List the specific components of the intervention.",
    "Intervention_Timing": "Describe the timing of the intervention.",
    # Sample and data
    "Sample_Size": "Provide the size of the sample studied.",
    "Country_Region": "Specify the country or region where the study was conducted.",
    "Population_Demographics": "Summarize the demographics of the population studied.",
    "Data_Source": "What is the source of the data used in this study?",
    # Results
    "Health_Economic_Outcome_Measured": "List the health or economic outcomes measured in this study.",
    "Key_Findings": "Summarize the main findings of the study.",
    "Causal_Estimates": "Provide estimates of causal impact found in the study.",
    "Statistical_Significance": "Comment on the statistical significance of the results.",
    "Robustness_Checks": "Describe any robustness checks performed.",
    "Heterogeneity_Subgroup_Analysis": "Explain if any heterogeneity or subgroup analysis was conducted.",
    # Caveats and implications
    "Limitations": "List any limitations of the study.",
    "Endogeneity_Selection_Concerns": "Describe concerns about endogeneity or selection bias, if any.",
    "Policy_Implications": "What are the policy implications of the study’s findings?",
    "Intervention_Takeup": "Report whether and how the study discusses take-up and noncompliance in the treatment group.",
}



In [6]:
# Accumulators filled by the cells below.

# IDs of the remote resources created per paper. The file IDs are only needed
# for deletion at the end of the notebook.
file_id_list, vector_store_id_list, thread_id_list = [], [], []

# Full API objects retrieved for those IDs.
vector_store_list, thread_list = [], []

# Final assistant answer per paper.
all_responses = dict()



In [8]:
# Upload every PDF and index it in its own vector store, recording the IDs of
# both so they can be polled, attached to threads and deleted later.
for file_stream in file_streams:
    # Take the file name from the path itself rather than from a fixed "/"
    # position, so nested directories or other separators do not break it.
    name = Path(file_stream.name).name

    # The handle is only needed for the upload; the context manager closes it
    # afterwards (its `.name` stays readable for later cells).
    with file_stream:
        uploaded_file = client.files.create(
            file=file_stream,
            purpose="assistants",
        )
    file_id_list.append(uploaded_file.id)

    # One store per paper keeps file_search scoped to a single document.
    vector_store = client.beta.vector_stores.create(
        name="Health Economics Research Papers V0.2",
        expires_after={"anchor": "last_active_at", "days": 1},
        file_ids=[uploaded_file.id],
    )
    vector_store_id_list.append(vector_store.id)

    print(f"Processed vector store and file batch for: {name}")


Processed vector store and file batch for: acemoglu-2023-distorted-innovation-does-the-market-get-the-direction-of-technology-right.pdf
Processed vector store and file batch for: guidetti-et-al-2021-placebo-tests-for-the-impacts-of-air-pollution-on-health-the-challenge-of-limited-health-care.pdf


In [9]:
import time

# Block until every vector store has finished processing its file, then keep
# the final store object for the thread-creation cell.
for vec_id in vector_store_id_list:
    store = client.beta.vector_stores.retrieve(vector_store_id=vec_id)
    # `status` is only refreshed by retrieving the store again, so the fetch
    # has to happen inside the loop; sleeping alone would spin forever.
    while store.status == "in_progress":
        time.sleep(2)
        store = client.beta.vector_stores.retrieve(vector_store_id=vec_id)
    # Any other non-completed state (e.g. "expired") will not resolve by waiting.
    if store.status != "completed":
        raise RuntimeError(
            f"Vector store {vec_id} ended with status {store.status!r}"
        )
    vector_store_list.append(store)

In [10]:
# Create one thread per vector store so file_search in that thread only sees
# the paper held by that store; remember the thread IDs for the later cells.
for vector_store in vector_store_list:
    search_resources = {"file_search": {"vector_store_ids": [vector_store.id]}}
    thread = client.beta.threads.create(tool_resources=search_resources)
    thread_id_list.append(thread.id)

    print(f"Updated assistant and created thread for vector store ID: {vector_store.id}")


Updated assistant and created thread for vector store ID: vs_yRNOq78bK4JdjtguLP65ULzD
Updated assistant and created thread for vector store ID: vs_MXHakFVVtBBRctRcPGk1VNtF


In [11]:
# Fetch the full thread object for every recorded thread ID.
thread_list.extend(
    client.beta.threads.retrieve(thr_id) for thr_id in thread_id_list
)

In [13]:
# Ask all questions in each paper's thread and execute one assistant run per
# thread, collecting the finished runs.
run_list = []

for thread_id in thread_id_list:
    # Post every query as its own user message before starting the run.
    for query in queries.values():
        client.beta.threads.messages.create(
            role="user",
            thread_id=thread_id,
            content=query,
        )

    print(f"Executing run for thread ID: {thread_id}")
    # create_and_poll does the waiting itself. Sleeping on the returned object
    # cannot change its status, so a run that did not complete is reported as
    # an error instead of looping forever.
    run = client.beta.threads.runs.create_and_poll(
        thread_id=thread_id, assistant_id=assistant.id
    )
    if run.status != "completed":
        raise RuntimeError(
            f"Run {run.id} for thread {thread_id} ended with status "
            f"{run.status!r}: {run.last_error}"
        )

    run_list.append(run)



Executing run for thread ID: thread_oxGBoZl876glq3RB8wldHNCN
Executing run for thread ID: thread_S8KGnnTDmCuifFGHl5SPneZK


In [15]:
# Show each finished run on its own line for inspection.
for finished_run in run_list:
    print(finished_run)

Run(id='run_cyS3JiTezfrEee6c3EzYEClk', assistant_id='asst_goyXy5YArE5F3Z2OX3WhCfY4', cancelled_at=None, completed_at=1732209823, created_at=1732209805, expires_at=None, failed_at=None, incomplete_details=None, instructions='You are an scientific researcher in the area of health economics. Use your knowledge base to help answer questions about the health economics research papers.', last_error=None, max_completion_tokens=None, max_prompt_tokens=None, metadata={}, model='gpt-4o-mini', object='thread.run', parallel_tool_calls=True, required_action=None, response_format='auto', started_at=1732209807, status='completed', thread_id='thread_oxGBoZl876glq3RB8wldHNCN', tool_choice='auto', tools=[FileSearchTool(type='file_search', file_search=FileSearch(max_num_results=None, ranking_options=FileSearchRankingOptions(score_threshold=0.0, ranker='default_2024_08_21')))], truncation_strategy=TruncationStrategy(type='auto', last_messages=None), usage=Usage(completion_tokens=1120, prompt_tokens=17700,

In [17]:
# Store the assistant's reply for each paper, keyed by the PDF's file name.
# thread_id_list and file_streams were filled in the same order, so zipping
# them pairs every thread with the paper it was created for.
for thread_id, file_stream in zip(thread_id_list, file_streams):
    name = Path(file_stream.name).name
    messages = list(client.beta.threads.messages.list(thread_id=thread_id))
    if not messages:
        print(f"Empty list returned for process in: {thread_id}")
        continue

    # The first listed message is taken as the reply.
    # NOTE(review): this relies on the list endpoint returning newest first.
    latest = messages[0]
    # Guard against saving one of our own questions as the "response" when the
    # assistant produced no reply.
    if latest.role != "assistant":
        print(f"No assistant reply found for thread ID: {thread_id}")
        continue

    all_responses[name] = latest.content[0].text.value
    print(f"Processed responses and citations for thread ID: {thread_id}")


Processed responses and citations for thread ID: thread_oxGBoZl876glq3RB8wldHNCN
Processed responses and citations for thread ID: thread_S8KGnnTDmCuifFGHl5SPneZK


In [18]:
import json

# Persist the collected answers as pretty-printed JSON.
output_file = 'output_file.json'
with Path(output_file).open("w", encoding="utf-8") as json_out:
    json.dump(all_responses, json_out, indent=4)

In [19]:
import json

# Convert the saved JSON answers into a plain-text report: one underlined
# "File: <name>" header per paper, then each non-empty line of the answer
# followed by a blank line.
input_file = "output_file.json"
output_file = "output.txt"

# (old, new) substitutions applied in order. json.load has already decoded
# every escape sequence, so the markers (U+2020, U+3010, U+3011) must be
# matched as real characters. The escaped spellings are kept as a fallback for
# text that still contains them literally.
replacements = [
    ("\\n", "\n"),
    ("\\u2020", ""),
    ("\u2020", ""),
    ("\\u3010", "["),
    ("\u3010", "["),
    ("\\u3011", "]"),
    ("\u3011", "]"),
]

with open(input_file, "r", encoding="utf-8") as src:
    data = json.load(src)

with open(output_file, "w", encoding="utf-8") as dst:
    for key, value in data.items():
        dst.write(f"File: {key}\n")
        # Underline as wide as the header: len("File: ") == 6.
        dst.write("=" * (6 + len(key)) + "\n\n")

        formatted_value = value
        for old, new in replacements:
            formatted_value = formatted_value.replace(old, new)

        for line in formatted_value.split("\n"):
            stripped = line.strip()
            if stripped:
                dst.write(f"{stripped}\n\n")

print(f"output written to {output_file}")


output written to output.txt


In [20]:
# Delete the uploaded file, vector store and thread created for each paper.
# The three lists were appended to in lockstep, so zip() yields one triple of
# IDs per paper.
cleanup_targets = zip(file_id_list, vector_store_id_list, thread_id_list)
for uploaded_file_id, store_id, thread_id in cleanup_targets:
    status_file = client.files.delete(file_id=uploaded_file_id)
    status_vec = client.beta.vector_stores.delete(vector_store_id=store_id)
    status_thr = client.beta.threads.delete(thread_id=thread_id)
    

