##### Step 1: Create a new Assistant with File Search Enabled

In [1]:
from openai import OpenAI
from dotenv import load_dotenv
import os

env_path = "../../keys.env"

load_dotenv(dotenv_path=env_path)
api_key = os.getenv("API_KEY")
project = os.getenv("PROJECT")
organization = os.getenv("ORGANIZATION")


In [2]:

 
client = OpenAI(
 organization=organization,
 project=project,
 api_key=api_key 
)



In [3]:
assistants = client.beta.assistants.list()
for assistant in assistants.data:
    print(f"Assistant ID: {assistant.id}")
    print(f"Name: {assistant.name}")
    print(f"Model: {assistant.model}")
    print(f"Tools: {assistant.tools}")
    print("-" * 30)

Assistant ID: asst_EEaiZaR394wa9UIPrXAX6gr0
Name: Research Paper Analyst
Model: gpt-4o
Tools: [FileSearchTool(type='file_search', file_search=FileSearch(max_num_results=None, ranking_options=FileSearchRankingOptions(score_threshold=0.0, ranker='default_2024_08_21')))]
------------------------------


In [4]:
assistant = client.beta.assistants.retrieve("asst_EEaiZaR394wa9UIPrXAX6gr0")


In [None]:
# no need to create it anymore
 
assistant = client.beta.assistants.create(
  name="Health Economics Research Paper Analyst",
  instructions="You are an scientific researcher in the area of health economics. Use your knowledge base to help answer questions about the health economics research papers.",
  model="gpt-4o",
  tools=[{"type": "file_search"}],
)

##### Step 2: Upload files and add them to a Vector Store

In [5]:
vector_store = client.beta.vector_stores.create(name="Health Economics Research Papers", expires_after={"anchor": "last_active_at", "days": 1})
 
file_paths = ["test_pdfs/guidetti-et-al-2021-placebo-tests-for-the-impacts-of-air-pollution-on-health-the-challenge-of-limited-health-care.pdf"]
file_streams = [open(path, "rb") for path in file_paths]
 

file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
  vector_store_id=vector_store.id, files=file_streams
)
 
print(file_batch.status)
print(file_batch.file_counts)

completed
FileCounts(cancelled=0, completed=1, failed=0, in_progress=0, total=1)


In [6]:
print(file_batch.status)
# wait till finish!!!!!!

completed


.upload_and_poll():
1. Upload the file: The function uploads a file to the OpenAI vector store. The file could be in-memory (as bytes) or from a specified path.
2. Poll for completion: After uploading, it checks (polls) whether the file has been successfully processed and indexed. This is useful in cases where the processing takes time, and you want to wait until the file is fully ready for use in tasks like document search or interaction with an assistant.

##### Step 3: Update the assistant to use the new Vector Store

In [8]:
assistant_id = "asst_EEaiZaR394wa9UIPrXAX6gr0"


In [9]:
assistant = client.beta.assistants.update(
  assistant_id=assistant_id,
  tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)

##### Step 4: Create a thread

In [10]:

queries = {
    "Paper_ID": "Extract the unique identifier for this paper, if available. If not leave empty.",
    "Title": "Please provide the title of the paper.",
    "Authors": "List the authors of the paper.",
    "Year_of_Publication": "What year was this paper published?",
    "Journal_Name": "Provide the name of the journal where this paper was published.",
    "DOI_URL": "Extract the DOI or URL for this paper.",
    "Theory_Hypotheses": "Please extract the main theories or hypotheses of the study.",
    "Identification_Strategy": "Please explain the approach to establishing causality used in this study.",
    "Exogeneity_Assumptions": "List any assumptions about exogeneity made in this study.",
    "Control_Treatment_Definition": "Define the control and treatment groups as described in the study.",
    "Intervention_Components": "List the specific components of the intervention.",
    "Intervention_Timing": "Describe the timing of the intervention.",
    "Sample_Size": "Provide the size of the sample studied.",
    "Country_Region": "Specify the country or region where the study was conducted.",
    "Population_Demographics": "Summarize the demographics of the population studied.",
    "Data_Source": "What is the source of the data used in this study?",
    "Health_Economic_Outcome_Measured": "List the health or economic outcomes measured in this study.",
    "Key_Findings": "Summarize the main findings of the study.",
    "Causal_Estimates": "Provide estimates of causal impact found in the study.",
    "Statistical_Significance": "Comment on the statistical significance of the results.",
    "Robustness_Checks": "Describe any robustness checks performed.",
    "Heterogeneity_Subgroup_Analysis": "Explain if any heterogeneity or subgroup analysis was conducted.",
    "Limitations": "List any limitations of the study.",
    "Endogeneity_Selection_Concerns": "Describe concerns about endogeneity or selection bias, if any.",
    "Policy_Implications": "What are the policy implications of the study’s findings?"
}

thread = client.beta.threads.create(
    #assistant_id=assistant_id,
    #description="Research paper analysis"
)
thread_id = thread.id

responses = {}
for heading, query in queries.items():
    response = client.beta.threads.messages.create(
    role= "user",
    thread_id=thread_id,  # Pass the thread_id to maintain context
    content=query
    )
    responses[heading] = response.content


In [11]:

for section, response_data in responses.items():
    print(f"{section}:\n{response_data}\n{'-' * 50}")

Paper_ID:
[TextContentBlock(text=Text(annotations=[], value='Extract the unique identifier for this paper, if available. If not leave empty.'), type='text')]
--------------------------------------------------
Title:
[TextContentBlock(text=Text(annotations=[], value='Please provide the title of the paper.'), type='text')]
--------------------------------------------------
Authors:
[TextContentBlock(text=Text(annotations=[], value='List the authors of the paper.'), type='text')]
--------------------------------------------------
Year_of_Publication:
[TextContentBlock(text=Text(annotations=[], value='What year was this paper published?'), type='text')]
--------------------------------------------------
Journal_Name:
[TextContentBlock(text=Text(annotations=[], value='Provide the name of the journal where this paper was published.'), type='text')]
--------------------------------------------------
DOI_URL:
[TextContentBlock(text=Text(annotations=[], value='Extract the DOI or URL for this pa

In [None]:

# thread = client.beta.threads.create(
#     messages=[
#         {
#             "role": "user",
            
#             "content": """ 
#             Please analyze the documents in the vector store. 
#             For each document, extract and return the following information about the study design and findings in JSON format. 
#             If any field cannot be found, leave it as an empty string. Here are the categories and items to locate:
#             {
#                 "Study Design": {
#                     "Theory_Hypotheses": "(main theories or hypotheses of the study)",
#                     "Identification_Strategy": "(approach to establishing causality)",
#                     "Exogeneity_Assumptions": "(any assumptions about exogeneity)",
#                     "Control_Treatment_Definition": "(definitions of control and treatment groups)",
#                     "Intervention_Components": "(specific components of the intervention)",
#                     "Intervention_Timing": "(timing of the intervention)"
#                 },
#                 "Population": {
#                     "Sample_Size": "(size of the sample studied)",
#                     "Country_Region": "(location where the study was conducted)",
#                     "Population_Demographics": "(demographics of the population studied)"
#                 },
#                 "Data & Outcomes": {
#                     "Data_Source": "(source of data used)",
#                     "Health_Economic_Outcome_Measured": "(outcomes measured in health or economics)",
#                     "Key_Findings": "(main findings of the study)",
#                     "Causal_Estimates": "(estimates of causal impact)",
#                     "Statistical_Significance": "(significance of the results)"
#                 },
#                 "Analysis": {
#                     "Robustness_Checks": "(any robustness checks performed)",
#                     "Heterogeneity_Subgroup_Analysis": "(analysis by subgroups or heterogeneity)",
#                     "Limitations": "(limitations of the study)",
#                     "Endogeneity_Selection_Concerns": "(concerns about endogeneity or selection bias)"
#                 },
#                 "Policy & Implications": {
#                     "Policy_Implications": "(implications of findings for policy)"
#                 }
#             }

#                """
#         }
#     ]
# )



##### Step 5: Create a run and check the output

In [12]:

run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id, assistant_id=assistant.id
)
print(run.id)
messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))

message_content = messages[0].content[0].text
annotations = message_content.annotations
citations = []
for index, annotation in enumerate(annotations):
    message_content.value = message_content.value.replace(annotation.text, f"[{index}]")
    if file_citation := getattr(annotation, "file_citation", None):
        cited_file = client.files.retrieve(file_citation.file_id)
        citations.append(f"[{index}] {cited_file.filename}")

print(message_content.value)
print("\n".join(citations))

run_iXiKFy4IzO9xe8TYvYDlss20
The paper provided does not contain a unique identifier within the initial displayed contents. Let's gather the information required from the paper:

1. **Title of the paper**: “Placebo Tests” for the Impacts of Air Pollution on Health: The Challenge of Limited Healthcare Infrastructure[0].

2. **Authors**: Bruna Guidetti, Paula Pereda, and Edson Severnini[0].

3. **Publication Year**: The paper was published in 2021[0].

4. **Journal Name**: The paper was published in AEA Papers and Proceedings[0].

5. **DOI or URL**: The paper's DOI is https://doi.org/10.1257/pandp.20211031[4].

For the further detailed questions such as theories, hypotheses, causality, sample size, etc., it appears the document has already provided information:

- **Main Theories or Hypotheses**: The paper examines how a large metropolitan area in Brazil copes with increased healthcare demand due to high levels of air pollution under hospital capacity constraints. It investigates how dai

In [None]:
print(run.status)

In [13]:
print(messages)

[Message(id='msg_WWyfViv5QM6vAWndO9pR0v34', assistant_id='asst_EEaiZaR394wa9UIPrXAX6gr0', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[FileCitationAnnotation(end_index=411, file_citation=FileCitation(file_id='file-C3yj2z2Uk1cXgXfpIOpzwE52'), start_index=287, text='【28:0†guidetti-et-al-2021-placebo-tests-for-the-impacts-of-air-pollution-on-health-the-challenge-of-limited-health-care.pdf】', type='file_citation'), FileCitationAnnotation(end_index=603, file_citation=FileCitation(file_id='file-C3yj2z2Uk1cXgXfpIOpzwE52'), start_index=479, text='【28:0†guidetti-et-al-2021-placebo-tests-for-the-impacts-of-air-pollution-on-health-the-challenge-of-limited-health-care.pdf】', type='file_citation'), FileCitationAnnotation(end_index=786, file_citation=FileCitation(file_id='file-C3yj2z2Uk1cXgXfpIOpzwE52'), start_index=662, text='【28:0†guidetti-et-al-2021-placebo-tests-for-the-impacts-of-air-pollution-on-health-the-challenge-of-limited-health-care.pdf】', type='fil

In [None]:
print(message_content.value)
print("\n".join(citations))