##### Step 1: Create a new Assistant with File Search Enabled

In [None]:
import os
from contextlib import ExitStack

from dotenv import load_dotenv
from openai import OpenAI

# Dotenv file that holds the OpenAI credentials; the path is relative to the
# directory the notebook is run from.
env_path = "../../keys.env"

# Load the file into the process environment, then read the three settings
# used to construct the client. A variable that is not set yields None.
load_dotenv(dotenv_path=env_path)
api_key, project, organization = (
    os.environ.get(name) for name in ("API_KEY", "PROJECT", "ORGANIZATION")
)


In [None]:

 
# API client built from the credentials read from the environment file.
client = OpenAI(
    api_key=api_key,
    organization=organization,
    project=project,
)

# Register an assistant that has the file_search tool enabled. No vector store
# is linked to it at this point.
assistant = client.beta.assistants.create(
    model="gpt-4o",
    name="Research Paper Analyst",
    tools=[{"type": "file_search"}],
    instructions="You are an scientific researcher in the area of health economics. Use your knowledge base to help answer questions about the health economics research papers.",
)

##### Step 2: Upload files and add them to a Vector Store

In [5]:
# Create the vector store that will hold the uploaded research papers.
vector_store = client.beta.vector_stores.create(name="Health Economics Research Papers")

# Local PDF files to upload; paths are relative to the notebook's working directory.
file_paths = ["test_pdfs/guidetti-et-al-2021-placebo-tests-for-the-impacts-of-air-pollution-on-health-the-challenge-of-limited-health-care.pdf"]

# Open every file through an ExitStack so that all handles are closed as soon
# as the upload is done, even if a later open() or the upload itself raises.
# (Previously the handles were opened in a list comprehension and never closed.)
with ExitStack() as stack:
    file_streams = [stack.enter_context(open(path, "rb")) for path in file_paths]

    # Upload the files as one batch and wait (poll) until processing finishes.
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )

# Report the outcome of the batch: overall status and per-state file counts.
print(file_batch.status)
print(file_batch.file_counts)

completed
FileCounts(cancelled=0, completed=1, failed=0, in_progress=0, total=1)


In [6]:
# Show the upload batch status again on its own; this is the same attribute
# that the upload cell already printed.
print(file_batch.status)

completed


`upload_and_poll()` does two things:
1. Upload the files: the function uploads the files to the OpenAI vector store. Each file can be supplied in memory (as bytes) or from a specified path.
2. Poll for completion: after uploading, it repeatedly checks (polls) whether the files have been successfully processed and indexed. This is useful because processing takes time, and you want to wait until the files are fully ready before using them in tasks such as document search or interaction with an assistant.

##### Step 3: Update the assistant to use the new Vector Store

In [7]:
# Link the vector store created in Step 2 to the assistant's file_search tool,
# and rebind `assistant` to the object returned by the update call.
assistant = client.beta.assistants.update(
    tool_resources={
        "file_search": {
            "vector_store_ids": [vector_store.id],
        },
    },
    assistant_id=assistant.id,
)

##### Step 4: Create a thread

In [10]:
# Create a thread seeded with a single user message containing the extraction
# prompt. No file is attached to the message itself; the prompt refers to "the
# documents in the vector store", which Step 3 linked to the assistant.
# NOTE(review): the prompt text has a typo ("seperated"), leaves a '(' unclosed
# in the "Intervention type" and "Population/Demographics" items, and has no ','
# after the "Country/Region" item although it states that items are comma
# separated. It is left untouched here because editing it changes the model input.
thread = client.beta.threads.create(
    messages=[
        {
            "role": "user",
            "content": """
            Please analyze the documents in the vector store. 
            For these documents find the following information about the study design and return them in json format. Each
            information you need to find is seperated by a ',' and short description of the metric is given inside '()'.
            Here are the things you must find from the documents:  
            Sample size (Number of participants in the study),
            Intervention type (Description of the intervention (e.g., drug, policy change),
            Randomization Method (Details on how participants were randomized),
            Country/Region (Where the study was conducted)
            Population/Demographics (Key demographic details (e.g., age group, gender, socio-economic status),           
            Health Outcome Measured (Specific health outcome(s) focused on)
               """
        }
    ]
)



##### Step 5: Create a run and check the output

In [11]:

# Start a run of the assistant on the thread and wait (poll) until it stops.
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id, assistant_id=assistant.id
)

# A run that did not complete has no answer to read. Fail here with the run's
# status instead of with an opaque IndexError on messages[0] further down.
if run.status != "completed":
    raise RuntimeError(
        f"Run {run.id} ended with status {run.status!r}: {getattr(run, 'last_error', None)}"
    )

# Messages produced by this run only.
messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))
if not messages or not messages[0].content:
    raise RuntimeError(f"Run {run.id} completed but produced no message content")

# NOTE(review): assumes the first content part of the first message is a text
# part (has `.text`) -- confirm if the assistant can return other content types.
message_content = messages[0].content[0].text
annotations = message_content.annotations
citations = []
for index, annotation in enumerate(annotations):
    # Replace the annotation marker in the answer text with a numbered
    # reference. This edits message_content.value in place; the later display
    # cell prints that same attribute.
    message_content.value = message_content.value.replace(annotation.text, f"[{index}]")
    # Only annotations carrying a file_citation contribute a citation line;
    # the cited file's name is looked up through the Files API.
    if file_citation := getattr(annotation, "file_citation", None):
        cited_file = client.files.retrieve(file_citation.file_id)
        citations.append(f"[{index}] {cited_file.filename}")

print(message_content.value)
print("\n".join(citations))

Based on the information extracted from the document, here is the study design summary in JSON format:

```json
{
  "Sample size": "89,492 observations",
  "Intervention type": "Air pollution exposure (measuring PM10 levels) as an indicator of health impacts",
  "Randomization Method": "Wind speed used as an instrument to address endogenous exposure to air pollution",
  "Country/Region": "Sao Paulo Metropolitan Area, Brazil",
  "Population/Demographics": "Children aged one to five years",
  "Health Outcome Measured": "Pediatric hospitalizations for respiratory diseases including asthma and pneumonia"
}
```

These details are derived from the study focusing on air pollution's impact on health within the Sao Paulo Metropolitan Area, where data was collected between 2015 and 2017[0].
[0] guidetti-et-al-2021-placebo-tests-for-the-impacts-of-air-pollution-on-health-the-challenge-of-limited-health-care.pdf


In [None]:
# Display the answer text followed by the numbered citation list, one per line.
print(message_content.value, "\n".join(citations), sep="\n")