In [56]:
from openai import OpenAI
from dotenv import load_dotenv
import os
from pathlib import Path


env_path = "../../keys.env"

load_dotenv(dotenv_path=env_path)
api_key = os.getenv("API_KEY")
project = os.getenv("PROJECT")
organization = os.getenv("ORGANIZATION")

In [57]:

client = OpenAI(
 organization=organization,
 project=project,
 api_key=api_key 
)

In [58]:
assistants = client.beta.assistants.list()
for assistant in assistants.data:
    print(f"Assistant ID: {assistant.id}")
    print(f"Name: {assistant.name}")
    print(f"Model: {assistant.model}")
    print(f"Tools: {assistant.tools}")
    print("-" * 30)

Assistant ID: asst_EEaiZaR394wa9UIPrXAX6gr0
Name: Research Paper Analyst
Model: gpt-4o
Tools: [FileSearchTool(type='file_search', file_search=FileSearch(max_num_results=None, ranking_options=FileSearchRankingOptions(score_threshold=0.0, ranker='default_2024_08_21')))]
------------------------------


In [59]:
assistant = client.beta.assistants.retrieve("asst_EEaiZaR394wa9UIPrXAX6gr0")


In [60]:
assistant_id = assistant.id
print(assistant_id)

asst_EEaiZaR394wa9UIPrXAX6gr0


##### Step 2: Upload files and add them to a Vector Store


In [5]:

vector_store = client.beta.vector_stores.create(name="Health Economics Research Papers V0.2", expires_after={"anchor": "last_active_at", "days": 1})


<generator object Path.glob at 0x00000270A6CC5140>

In [61]:

dir_str_path = "test_pdfs/"
dir_path = Path(dir_str_path)

file_paths = []
file_names = []
for pdf_file in dir_path.glob("*.pdf"):
    file_paths.append(dir_str_path + pdf_file.name)
    file_names.append(pdf_file.name)


file_streams = [open(path, "rb") for path in file_paths]


In [94]:
list(dir_path.glob("*.pdf"))

[WindowsPath('test_pdfs/acemoglu-2023-distorted-innovation-does-the-market-get-the-direction-of-technology-right.pdf'),
 WindowsPath('test_pdfs/guidetti-et-al-2021-placebo-tests-for-the-impacts-of-air-pollution-on-health-the-challenge-of-limited-health-care.pdf')]

In [62]:
print(file_streams)


[<_io.BufferedReader name='test_pdfs/acemoglu-2023-distorted-innovation-does-the-market-get-the-direction-of-technology-right.pdf'>, <_io.BufferedReader name='test_pdfs/guidetti-et-al-2021-placebo-tests-for-the-impacts-of-air-pollution-on-health-the-challenge-of-limited-health-care.pdf'>]


In [7]:
file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
  vector_store_id=vector_store.id, files=file_streams
)

In [8]:
print(file_batch.status)
# wait till finish!!!!!!

completed


##### Step 3: Update the assistant to use the new Vector Store

In [11]:
assistant = client.beta.assistants.update(
  assistant_id=assistant_id,
  tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)

##### Step 4: Create a thread

In [97]:

queries = {
    "Paper_ID": "Extract the unique identifier for this paper, if available. If not leave empty.",
    "Title": "Please provide the title of the paper.",
    "Authors": "List the authors of the paper.",
    "Year_of_Publication": "What year was this paper published?",
    "Journal_Name": "Provide the name of the journal where this paper was published.",
    "DOI_URL": "Extract the DOI or URL for this paper.",
    "Theory_Hypotheses": "Please extract the main theories or hypotheses of the study.",
    "Identification_Strategy": "Please explain the approach to establishing causality used in this study.",
    "Exogeneity_Assumptions": "List any assumptions about exogeneity made in this study.",
    "Control_Treatment_Definition": "Define the control and treatment groups as described in the study.",
    "Intervention_Components": "List the specific components of the intervention.",
    # "Intervention_Timing": "Describe the timing of the intervention.",
    # "Sample_Size": "Provide the size of the sample studied.",
    # "Country_Region": "Specify the country or region where the study was conducted.",
    # "Population_Demographics": "Summarize the demographics of the population studied.",
    # "Data_Source": "What is the source of the data used in this study?",
    # "Health_Economic_Outcome_Measured": "List the health or economic outcomes measured in this study.",
    # "Key_Findings": "Summarize the main findings of the study.",
    # "Causal_Estimates": "Provide estimates of causal impact found in the study.",
    # "Statistical_Significance": "Comment on the statistical significance of the results.",
    # "Robustness_Checks": "Describe any robustness checks performed.",
    # "Heterogeneity_Subgroup_Analysis": "Explain if any heterogeneity or subgroup analysis was conducted.",
    # "Limitations": "List any limitations of the study.",
    # "Endogeneity_Selection_Concerns": "Describe concerns about endogeneity or selection bias, if any.",
    # "Policy_Implications": "What are the policy implications of the study’s findings?"
}



In [98]:
# Initialize necessary lists and dictionaries
vector_store_list = []
thread_list = []
response_per_file = {}
all_responses = {}

# Assuming `file_streams` is already defined and contains opened file objects
# Assuming `queries` and `assistant_id` are also defined


In [99]:
for file_stream in file_streams:
    # Extract the file name
    name = file_stream.name.split("/")[1]
    
    # Create a vector store
    vector_store = client.beta.vector_stores.create(
        name="Health Economics Research Papers V0.2", 
        expires_after={"anchor": "last_active_at", "days": 1}
    )
    vector_store_list.append(vector_store)
    
    # Upload and poll the file batch to the vector store
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=[file_stream]
    )
    
    # Store `name` and `vector_store` for later use in subsequent cells
    print(f"Processed vector store and file batch for: {name}")


Processed vector store and file batch for: acemoglu-2023-distorted-innovation-does-the-market-get-the-direction-of-technology-right.pdf
Processed vector store and file batch for: guidetti-et-al-2021-placebo-tests-for-the-impacts-of-air-pollution-on-health-the-challenge-of-limited-health-care.pdf


In [100]:
print(vector_store_list)

[VectorStore(id='vs_PCyAByajVcahXHjInC5TU4MK', created_at=1731602404, file_counts=FileCounts(cancelled=0, completed=0, failed=0, in_progress=0, total=0), last_active_at=1731602404, metadata={}, name='Health Economics Research Papers V0.2', object='vector_store', status='completed', usage_bytes=0, expires_after=ExpiresAfter(anchor='last_active_at', days=1), expires_at=1731688804), VectorStore(id='vs_T1QQjLp1o3pEBnnGhTXP99Gs', created_at=1731602410, file_counts=FileCounts(cancelled=0, completed=0, failed=0, in_progress=0, total=0), last_active_at=1731602410, metadata={}, name='Health Economics Research Papers V0.2', object='vector_store', status='completed', usage_bytes=0, expires_after=ExpiresAfter(anchor='last_active_at', days=1), expires_at=1731688810)]


In [101]:
for vector_store in vector_store_list:
    # Update assistant with the vector store ID
    # assistant = client.beta.assistants.update(
    #     assistant_id=assistant_id,
    #     tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}}
    # )
    
    # Create a thread and add it to the thread list
    thread = client.beta.threads.create(tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}})
    thread_list.append(thread.id)
    
    print(f"Updated assistant and created thread for vector store ID: {vector_store.id}")


Updated assistant and created thread for vector store ID: vs_PCyAByajVcahXHjInC5TU4MK
Updated assistant and created thread for vector store ID: vs_T1QQjLp1o3pEBnnGhTXP99Gs


In [102]:
print(thread_list)

['thread_Ie8zUiD1hdlCCYUv0ZBEU01t', 'thread_moJ0lhBGfUszhhYX4SUYYgmL']


In [103]:
run_list = []
for thread_id in thread_list:
    response_per_file = {}
    
    for heading, query in queries.items():
        # Send the query message to the thread
        response = client.beta.threads.messages.create(
            role="user",
            thread_id=thread_id,
            content=query
        )
       # response_per_file[heading] = response.content
    
    # Execute the run and poll for completion
    print(f"Executing run for thread ID: {thread_id}")
    run = client.beta.threads.runs.create_and_poll(
        thread_id=thread_id, assistant_id=assistant.id
    )
    run_list.append(run)
    if run.status == "completed":
        print(f"Run completed for thread ID: {thread_id}")


Executing run for thread ID: thread_Ie8zUiD1hdlCCYUv0ZBEU01t
Executing run for thread ID: thread_moJ0lhBGfUszhhYX4SUYYgmL


In [108]:
#print(run_list)
for run in run_list:
    print(run)

Run(id='run_44YhA5zCUQzebHSlvXOqS8kl', assistant_id='asst_EEaiZaR394wa9UIPrXAX6gr0', cancelled_at=None, completed_at=None, created_at=1731602450, expires_at=None, failed_at=1731602458, incomplete_details=None, instructions='You are an scientific researcher in the area of health economics. Use your knowledge base to help answer questions about the health economics research papers.', last_error=LastError(code='rate_limit_exceeded', message='Request too large for gpt-4o in organization org-ZImU13r27Api2b0n9bqJuyeN on tokens per min (TPM): Limit 30000, Requested 33376. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.'), max_completion_tokens=None, max_prompt_tokens=None, metadata={}, model='gpt-4o', object='thread.run', parallel_tool_calls=True, required_action=None, response_format='auto', started_at=1731602451, status='failed', thread_id='thread_Ie8zUiD1hdlCCYUv0ZBEU01t', tool_choice='auto', tool

In [None]:
for thread_id, run_id in zip(thread_list, run_list):
    # Retrieve the messages after the run has completed
    messages = list(client.beta.threads.messages.list(thread_id=thread_id, run_id=run.id))
    
    # Process the message content, annotations, and citations
    message_content = messages[0].content[0].text
    annotations = message_content.annotations
    citations = []
    
    for index, annotation in enumerate(annotations):
        message_content.value = message_content.value.replace(annotation.text, f"[{index}]")
        if file_citation := getattr(annotation, "file_citation", None):
            cited_file = client.files.retrieve(file_citation.file_id)
            citations.append(f"[{index}] {cited_file.filename}")
    
    # Store the final content in `all_responses`
    all_responses[name] = message_content.value
    print(f"Processed responses and citations for thread ID: {thread_id}")


In [None]:
# import json

# output_file = "multiple_files_test01.json"

# thread_list = []
# vector_store_list = []
# all_responses = {}
# response_per_file = {}
# for file_stream in file_streams:

#     name = file_stream.name.split("/")[1]
#     vector_store = client.beta.vector_stores.create(name="Health Economics Research Papers V0.2", expires_after={"anchor": "last_active_at", "days": 1})
#     vector_store_list.append(vector_store)
    
#     file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
#         vector_store_id=vector_store.id, files=[file_stream]
#     )
    
#     if(file_batch.status == "completed"):
#         assistant = client.beta.assistants.update(
#             assistant_id=assistant_id,
#             tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
#         )

#         thread = client.beta.threads.create(tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}})
#         thread_list.append(thread.id)
#         for heading, query in queries.items():
#             response = client.beta.threads.messages.create(
#             role= "user",
#             thread_id=thread.id,  # Pass the thread_id to maintain context
#             content= "In the research paper " + str(name) + ": " + str(query)
#             )
#             response_per_file[heading] = response.content
        
#         print("Executing run for: " + name)
#         run = client.beta.threads.runs.create_and_poll(
#             thread_id=thread.id, assistant_id=assistant.id
#         )
#         if(run.status == "completed"):
#             messages = list(client.beta.threads.messages.list(thread_id = thread.id, run_id = run.id))

#             message_content = messages[0].content[0].text
#             annotations = message_content.annotations
#             citations = []
#             for index, annotation in enumerate(annotations):
#                 message_content.value = message_content.value.replace(annotation.text, f"[{index}]")
#                 if file_citation := getattr(annotation, "file_citation", None):
#                     cited_file = client.files.retrieve(file_citation.file_id)
#                     citations.append(f"[{index}] {cited_file.filename}")
            
#             all_responses[name] = message_content.value
    


In [55]:
print(thread_list)
my_thread = client.beta.threads.retrieve(thread_list[0])

thread_messages = list(client.beta.threads.messages.list(my_thread.id))

message_content = thread_messages[0].content[0].text

print(thread_messages)


['thread_6EJeuip9wtmTdhouOtHNUTLf']
[Message(id='msg_lbGnpncEJj3Dt46gCvjogW4C', assistant_id=None, attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value='In the research paper acemoglu-2023-distorted-innovation-does-the-market-get-the-direction-of-technology-right.pdf: What are the policy implications of the study’s findings?'), type='text')], created_at=1731598628, incomplete_at=None, incomplete_details=None, metadata={}, object='thread.message', role='user', run_id=None, status=None, thread_id='thread_6EJeuip9wtmTdhouOtHNUTLf'), Message(id='msg_PTxOIYgIuvqsKATdkoQfZMsZ', assistant_id=None, attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value='In the research paper acemoglu-2023-distorted-innovation-does-the-market-get-the-direction-of-technology-right.pdf: Describe concerns about endogeneity or selection bias, if any.'), type='text')], created_at=1731598628, incomplete_at=None, incomplete_details=None, met

In [36]:
print(message_content)


Text(annotations=[], value='In the research paper acemoglu-2023-distorted-innovation-does-the-market-get-the-direction-of-technology-right.pdf: What are the policy implications of the study’s findings?')


In [38]:
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_responses, f, indent=4)