## Running prompts on OpenAI

In [3]:
%pip install markdown
%pip install annoy
%pip install openai

Note: you may need to restart the kernel to use updated packages.
Collecting annoy
  Downloading annoy-1.17.3-cp310-cp310-macosx_11_0_arm64.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: annoy
Successfully installed annoy-1.17.3
Note: you may need to restart the kernel to use updated packages.


In [4]:
from openai import OpenAI # OpenAI API
import json
import requests # to download some resources
import os # file operations
import numpy as np # linear algebra
import pandas as pd # data processing
from markdown import markdown # to render markdown
from IPython.display import Markdown
import annoy # Approximate Nearest Neighbors Oh Yeah for fast searching
import pickle
from annoy import AnnoyIndex

In [5]:
# read in our api key
with open('../api-keys/our_api_key.txt', 'r') as file:
    api_key = file.read().replace('\n', '')

# read in finns api key (the one we'll use for testing)
with open('../api-keys/finns_api_key.txt', 'r') as file:
    finns_api_key = file.read().replace('\n', '')

In [6]:
# Setting the OpenAI
client = OpenAI(api_key=finns_api_key) # using finns for testing

#### Sample of countries

In [16]:
# Sample of countries
sample_countries = ['Finland','Netherlands','Denmark','Norway', 'Chile']

# Folder with raw PDFs
folder_mds = "../data/3-naps-md"

# Importing MDs files
for file in os.listdir(folder_mds):
    for country in sample_countries:
        if country in file:  
            with open(os.path.join(folder_mds, file), "r", encoding="utf-8") as md_file:
                content = md_file.read()
            globals()[country.lower()] = content # Saving the MD file in lowercase

### 1. Using the WHOLE document

#### * Asking OpenAI -- Example: Norway ||| Question: About policies/objectives

In [22]:
country_test = norway # TO CHANGE FOR TESTING

In [13]:
response = client.chat.completions.create(
    model="gpt-4o",
    response_format={"type":"json_object"},
    messages=[
        {"role": "system", "content": """

         You are an assistant that extract and summarizes national action plans for antimicrobial resistance. Be concise and rely only on the text content.

         Your task is to identify every actionable policy goal in the document and summarize them in JSON format.

         For each goal, you must put your findings in this JSON structure. Every policy goal should look like this. If any of the fields are not applicable, mark them as null:

            policies = [
            {
                "goal": the specific policy goat set out,
                "metric": the metric used to measure the policy goal,
                "steps": a list specific steps set out to achieve the policy goal,
                "agent": a list of agents responsible for the policy goal,
                "time_limit" : the time limit set for the policy goal,
                "funding": any funding allocated to the achievement policy goal and how it is denoted,
            }
         ]

        
         
         """},
        {"role": "user", "content": country_test}
    ]
)

print(response.choices[0].message.content)

{
  "policies": [
    {
      "goal": "Reduce the total use of antibiotics by 30% among total inhabitants, measured in DDD/1000 inhabitants/day in comparison to 2012.",
      "metric": "DDD/1000 inhabitants/day",
      "steps": ["Enhance understanding among the general population and prescribers about antibiotics usage", "Implement mass media campaigns", "Improve prescription practices"],
      "agent": ["Norwegian Government", "Health and Care Services", "Norwegian Board of Health Supervision"],
      "time_limit": "2020",
      "funding": null
    },
    {
      "goal": "Be one of the three European countries with the least antibiotics use in humans, measured in DDD/1000 inhabitants/day.",
      "metric": "DDD/1000 inhabitants/day",
      "steps": ["Improve antibiotic prescribing practices", "Bring in guidelines and training for prescribers"],
      "agent": ["Norwegian Government", "Health Care Services"],
      "time_limit": "2020",
      "funding": null
    },
    {
      "goal": 

In [14]:
# see how many tokens that last query used
response.usage

CompletionUsage(completion_tokens=1255, prompt_tokens=15948, total_tokens=17203, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))

#### * Asking OpenAI -- Example: Chile ||| Question: Period for NAP

In [24]:
country_test = chile # TO CHANGE FOR TESTING

In [26]:
prompt = """From this National Action Plan, extract the period considered for the National Action Plan and return it in a structured JSON format.
The response **must** follow this exact JSON structure:
{
    "country": "Name of country as string",
    "period_start": "The year when starts the period as numeric",
    "period_end": "The year when ends the period as numeric"
}

National Action Plan:"""

response = client.chat.completions.create(
    model="gpt-4o-mini",
    response_format={"type":"json_object"},
    messages=[
        {"role": "system", "content": """
         You are an assistant that extract and summarizes national action plans for antimicrobial resistance. Be concise and rely only on the text content.
         """},
        {"role": "user", "content": prompt + country_test},
    ]
)

# Print the JSON response
print(response.choices[0].message.content)

{
    "country": "Chile",
    "period_start": 2021,
    "period_end": 2025
}


In [18]:
# see how many tokens that last query used
response.usage

CompletionUsage(completion_tokens=28, prompt_tokens=22457, total_tokens=22485, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))

### 2. Using CHUNKS and embedding strategy

#### _Setting Functions_

In [19]:
# FUNCTION: Chunk the markdown
def chunk_markdown(md_text, max_chars=3000):
    """Chunks some markdown by adding new lines until exceeding max_chars.
       Each chunk includes the last line of the previous chunk."""

    lines = md_text.split("\n")  # Split into lines
    chunks = []
    current_chunk = []
    current_length = 0

    for i, line in enumerate(lines):
        # Always include the previous line for context
        if i > 0 and current_length + len(line) > max_chars:
            chunks.append("\n".join(current_chunk))  # Save the current chunk
            current_chunk = [lines[i-1]]  # Start new chunk with the preceding line
            current_length = len(lines[i-1])  # Reset length tracker

        current_chunk.append(line)
        current_length += len(line) + 1  # +1 for the newline character

    # Add the last chunk
    if current_chunk:
        chunks.append("\n".join(current_chunk))

    return chunks

In [20]:
# Embedding
def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding

#### * Asking OpenAI -- Example: Chile ||| Question: Period for NAP

In [30]:
country_test = chile # TO CHANGE FOR TESTING
country_name = "chile" # For filenames

##### _Chuncking and embedding process (for NAP and the question)_

In [37]:
# Chunking the markdown
chunks = chunk_markdown(country_test, max_chars=3000)

# Embedding
chunks_with_embeddings = []

for i, chunk in enumerate(chunks):
    print(f".", end="")
    embedding = get_embedding(chunk)
    chunks_with_embeddings.append({"chunk": i, "text": chunk, "embedding": embedding})

# Exporting and saving
save_path = f"../data/4-chunks/{country_name}_chunks_with_embeddings.pkl"

with open(save_path, "wb") as f: # it's always good to save the data you've received
    pickle.dump(chunks_with_embeddings, f)

globals()[f"{country_name}_chunks_with_embeddings"] = chunks_with_embeddings # Saving as independent object

.........................................

In [38]:
# MAIN QUESTION TO EMBED (No include the whole prompt. Focus on the key words to find "close" chunks)
question = "Cuál es el periodo de tiempo del plan?" # TO CHANGE

In [39]:
# Embedding the question
question_embedding = get_embedding(question)

# Retrieve the object using the country_name string
chunks_with_embeddings = globals()[f"{country_name}_chunks_with_embeddings"]

# Define the Annoy index - the index is the data structure that will store the embeddings
embedding_dim = len(chunks_with_embeddings[0]["embedding"])  # Get vector size
annoy_index = AnnoyIndex(embedding_dim, "angular")  # Angular distance for similarity

# Add chunks to Annoy index
for item in chunks_with_embeddings:
    annoy_index.add_item(item["chunk"], item["embedding"])

# Build the index (the argument is the number of 'trees' - more trees = more accurate but slower)
annoy_index.build(10)

# Find the most similar chunk to the question
n_nearest = 3
nearest_chunks = annoy_index.get_nns_by_vector(question_embedding, n_nearest, search_k=-1, include_distances=False)

nearest_chunks

[24, 20, 3]

##### _Running the prompt_

In [41]:
messages = [
    {"role": "system", "content": "You are an assistant that extract key information according the questions from National Action Plan on antimicrobial resistance. Rely only on the text content."},
    {"role": "user", "content": '''From this part (chunk) of the National Action Plan, extract the period considered for the current National Action Plan and return it in a structured JSON format:
The response **must** follow this exact JSON structure:
{
    "answer_contained": bool, whether the text contains enough information to confidently answer the question,
    "period_start": "The year when starts the period as numeric",
    "period_end": "The year when ends the period as numeric",
    "evidence": str, a direct quote from the report that supports your answer
}'''},
]

responses = []

for i, chunk in enumerate(nearest_chunks):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages + [{"role": "assistant", "content": chunks[chunk]}]
    )
    print(response.choices[0].message.content)

{
    "answer_contained": true,
    "period_start": 2021,
    "period_end": 2025,
    "evidence": "El Plan Nacional contra la Resistencia a los Antimicrobianos 2021 - 2025 mantiene las mismas lineas estratégicas del plan anterior"
}
{
    "answer_contained": true,
    "period_start": 2021,
    "period_end": 2025,
    "evidence": "esta primera etapa se realizó entre diciembre de 2020 y enero de 2021"
}
{
    "answer_contained": true,
    "period_start": 2021,
    "period_end": 2025,
    "evidence": "Lineamientos estratégicos para el periodo 2021- 2025"
}
