## **Running prompts on OpenAI**

### **0. BASE SETTING**

#### _0.1 Setting_

In [1]:
%pip install markdown
%pip install annoy
%pip install openai
%pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;4

In [2]:
from openai import OpenAI # OpenAI API
import json
import requests # to download some resources
import os # file operations
import numpy as np # linear algebra
import pandas as pd # data processing
from markdown import markdown # to render markdown
from IPython.display import Markdown
import annoy # Approximate Nearest Neighbors Oh Yeah for fast searching
import pickle
from annoy import AnnoyIndex
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#### _0.2 Loading MD files of the sample of countries_

In [3]:
# LOADING THE MD FILES

sample_countries = ['Netherlands','Norway', 'Chile']

# Folder with the MD files
folder_mds = "../data/3-naps-md"

# Importing MDs files
for file in os.listdir(folder_mds):
    for country in sample_countries:
        if country in file:  
            with open(os.path.join(folder_mds, file), "r", encoding="utf-8") as md_file:
                content = md_file.read()
            globals()[country.lower()] = content # Saving the MD file in lowercase

#### _0.3 Country to evaluate_

In [11]:
country_test = netherlands
country_name = "netherlands"

### **1. SETTING UP THE MODEL**

#### _1.1 API key_

In [5]:
# read in our api key
with open('../api-keys/our_api_key.txt', 'r') as file:
    api_key = file.read().replace('\n', '')

# read in finns api key (the one we'll use for testing)
with open('../api-keys/finns_api_key.txt', 'r') as file:
    finns_api_key = file.read().replace('\n', '')

In [6]:
# Setting the OpenAI
client = OpenAI(api_key=finns_api_key) # using finns for testing

#### _1.2 Chunks_

In [7]:
# FUNCTION: Chunk the markdown
def chunk_markdown(md_text, max_chars=3000):
    """Chunks some markdown by adding new lines until exceeding max_chars.
       Each chunk includes the last line of the previous chunk."""

    lines = md_text.split("\n")  # Split into lines
    chunks = []
    current_chunk = []
    current_length = 0

    for i, line in enumerate(lines):
        # Always include the previous line for context
        if i > 0 and current_length + len(line) > max_chars:
            chunks.append("\n".join(current_chunk))  # Save the current chunk
            current_chunk = [lines[i-1]]  # Start new chunk with the preceding line
            current_length = len(lines[i-1])  # Reset length tracker

        current_chunk.append(line)
        current_length += len(line) + 1  # +1 for the newline character

    # Add the last chunk
    if current_chunk:
        chunks.append("\n".join(current_chunk))

    return chunks

In [8]:
# Chunking the markdown
chunks = chunk_markdown(country_test, max_chars=3000)

### **2. RUNNING THE MODEL**

#### _2.1 Question 0: Period for NAP_

In [9]:
responses_period = []

for i, chunk in enumerate(chunks):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature= 0,
        top_p = 0.5,
        response_format={"type":"json_object"},
        messages=[
            {"role": "system", "content": """
            You are an assistant that extract information from national action plans for antimicrobial resistance (AMR). 
            Be concise and rely only on the text content.

            From this section of the National Action Plan, extract the period considered for the actions of the current National Action Plan and return it in a structured JSON format.
            If the period is not mentioned, return the period as null. No include other periods related to specific actions or goals.
            
            The response **must** follow this exact JSON structure:
                {
                    "period_start": The year when starts the period as numeric or null,
                    "period_end": The year when ends the period as numeric or null
                    "supporting_chunk": If the period_start and period_end are not null, include the all chunk text that supports the answer
                }
            """},
            {"role": "user", "content": chunk},
        ]
    )
    response_content = response.choices[0].message.content
    parsed_response = json.loads(response_content)
    responses_period.append(parsed_response)
    

In [12]:
start_years = [r["period_start"] for r in responses_period if r["period_start"] != None]
end_years = [r["period_end"] for r in responses_period if r["period_end"] != None]

# Function to find the most recurrent year
def most_recurrent_year(years):
    year_freq = {}
    
    # Count the frequency of each year
    for year in years:
        if year in year_freq:
            year_freq[year] += 1
        else:
            year_freq[year] = 1
    
    # Find the year with the maximum frequency
    if year_freq:
        return max(year_freq, key=year_freq.get)
    return "null"

# Final result using the most recurrent year
final_response_period = [
    {
        "country": country_name,
        "question": "period_start",
        "answer": most_recurrent_year(start_years)
    },
    {
        "country": country_name,
        "question": "period_end",
        "answer": most_recurrent_year(end_years)
    }
]

# Printing the final result
print(final_response_period)

[{'country': 'netherlands', 'question': 'period_start', 'answer': 2016}, {'country': 'netherlands', 'question': 'period_end', 'answer': 2016}]


#### _2.2 Questions 1-19: Yes/No_

In [13]:
# QUESTIONS
questions_yesno = [
    "Does the NAP include any strategy/policy/goal/action related to the priority sector of 'Human Health'?",
    "Does the NAP include any strategy/policy/goal/action related to the priority sector of 'Animal Health'?",
    "Does the NAP include any strategy/policy/goal/action related to the priority sector of 'Environment'?",
    "Does the NAP include any strategy/policy/goal/action related to the priority sector of 'Agriculture/Food Security'?",
    "Does the NAP include any mechanisms for progress reporting to track how its objectives are being met, such as an annual or semi-annual progress report, a dashboard displaying the status, or similar documents/tools?",
    "Does the NAP include any specific, time-bound targets to track progress?",
    "Does the NAP specify any budget allocation assigned for the strategies/policies/goals/actions directed at tackling AMR?",
    "Does the NAP establish a multisectoral committee or task force for AMR coordination?",
    "Does the NAP include any strategy/policy/goal/action related to training and educational initiative to increase awareness about AMR in school curricula or professional training programs for doctors and pharmacists?",
    "Does the NAP include any strategy/policy/goal/action related to public awareness campaigns in combating AMR, such as media involvement and community engagement?",
    "Does the NAP include any strategy/policy/goal/action related to international collaboration and support for AMR initiatives, such as partnerships with organizations like World Health Organization (WHO), Food and Agriculture Organization (FAO), and World Organisation for Animal Health (OIE)?",
    "Does the NAP include any strategy/policy/goal/action related to data collection and analysis for AMR surveillance, such as developing surveillance systems, data collection tools, setting up laboratories for AMR monitoring, or other similar actions?",
    "Does the NAP include any strategy/policy/goal/action related to developing the capacity to detect and report newly emerged resistance that may constitute a public health emergency of international concern, as required by the International Health Regulations (2005)?",
    "Does the NAP include any strategy/policy/goal/action related to specific prevention strategies such as sanitation, hygiene measures, and infection control?",
    "Does the NAP include any strategy/policy/goal/action related to implementing the standards outlined in the OIE Terrestrial and Aquatic Animal Health Codes or the World Health Organization's/Food and Agriculture Organization's Codex Alimentarius Code of Practice to Minimize and Contain Antimicrobial Resistance?",
    "Does the NAP include any strategy/policy/goal/action related to regulating the distribution, prescription, and dispensation of antibiotics, such as developing or maintaining essential medicine lists for antibiotics?",
    "Does the NAP include any strategy/policy/goal/action related to strengthening the legislative and regulatory framework for AMR, including laws and regulations for antimicrobial use?",
    "Does the NAP include any strategy/policy/goal/action related to requiring the rational use of antibiotics through regulations/laws?",
    "Does the NAP include any strategy/policy/goal/action related to participating in international collaborative research to support the development of new medicines, diagnostic tools, and vaccines?"
]


In [14]:
responses_yesno = []

for j, question in enumerate(questions_yesno):
    final_answer = "no" # Default answer
    supporting_chunks = []

    for i, chunk in enumerate(chunks):
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            temperature= 0,
            top_p = 0.5,
            response_format={"type":"json_object"},
            messages=[
                {"role": "system", "content": f"""
                You are an assistant that analyze and summarise information from national action plans for antimicrobial resistance (AMR). 
                Be concise and rely only on the text content.

                From this section of the National Action Plan, analyze and answer the following yes/no question: "{question}".
                If you cannot answer with the information provided, return null.
                
                The response **must** follow this exact JSON structure:
                {{
                    "value": yes/no/null,
                    "supporting_chunk": "If the answer is yes, include the chunk text that supports the answer"
                }}
                """},
                {"role": "user", "content": chunk},
            ]
        )
        response_content = response.choices[0].message.content
        parsed_response = json.loads(response_content)
        value = parsed_response.get("value")
        supporting_chunk = parsed_response.get("supporting_chunk")    
        if value == "yes":
            final_answer = "yes"
            supporting_chunks.append(supporting_chunk)            
        
    # After checking all chunks for this question, append the final result
    number_question = j + 1
    responses_yesno.append({
        "country": country_name,
        "question": number_question,
        "answer": final_answer,
        "supporting_chunks": supporting_chunks if final_answer == "yes" else []
    })  
    

In [15]:
# Printing the final responses
for response in responses_yesno:
    print(f"Question: {response['question']}")
    print(f"Final Answer: {response['answer']}")
    if response['answer'] == "yes":
        print("Supporting Chunks:")
        for chunk in response['supporting_chunks']:
            print(chunk)
    print()  

Question: 1
Final Answer: yes
Supporting Chunks:
The approach to antibiotic resistance we propose in this letter addresses all domains where human health is threatened by antibiotic-resistant bacteria; healthcare, animals, food and environment.
Good implementation of the WHO Global Action Plan on antimicrobial resistance in the coming years is of great importance. We support the WHO, FAO and OIE in these efforts. In concrete terms, this includes a specific financial contribution from the Ministry of Health, Welfare and Sport to the WHO, and strategic secondments to the WHO.
Antibiotic resistance is one of the Cabinet's priorities during this presidency.
The specific objective is reaching consensus on surveillance of resistant bacteria and antibiotic use in human healthcare, as a basis for developing infection prevention measures and promoting prudent use of antibiotics.
Avoidable harm and mortality among patients due to infections caused by resistant bacteria must be prevented wherever

In [16]:
# Combining responses_yesno to the response_period
summary_response = final_response_period.copy()
summary_response.extend(responses_yesno)

In [17]:
# Converting to a JSON string
summary_response_json = json.dumps(summary_response, indent=4)

# Saving
with open(f'../data/4-summary-responses-json/summary_response_{country_name}_usingchunks.json', 'w') as f:
    f.write(summary_response_json)

---

### **3. CALCULATING THE PERFORMANCE**

### _Performance: Chile_

In [5]:
# Only for re-calculating the metrics with new annotations
country_test = chile
country_name = "chile"

In [7]:
# Loading the gold standard
annotations_df = pd.read_excel('../data/5-annotations/annotations.xlsx')
annotations_df = annotations_df[annotations_df['country'] == country_name]

# Loading the LLM responses
with open(f'../data/4-summary-responses-json/summary_response_{country_name}_usingchunks.json', 'r') as f:
    llm_response = json.load(f)
llm_response_df = pd.DataFrame(llm_response)
#llm_response_df['question'] = llm_response_df['question'].astype(str)

# Merge the ground truth with the LLM responses based on both country and question
merged_df = pd.merge(annotations_df, llm_response_df, on=["country", "question"], suffixes=('_gs', '_llm'))

# keep the country of interest
merged_df.head()


Unnamed: 0,country,question,answer_gs,comment_input_for_protocol,supporting_text_when_true_answer_is_yes_but_prediction_is_no,answer_llm,supporting_chunks
0,chile,period_start,2021,,,2021,
1,chile,period_end,2025,,,2025,
2,chile,1,yes,,,yes,[Generar instancias de capacitación locales pa...
3,chile,2,yes,,,yes,[111.6. Prevenir y controlar infecciones en la...
4,chile,3,yes,,,yes,[111.5. Incorporar medidas regulatorias en rel...


In [None]:
# Saving file .xlsx with results
folder_per = f"../data/6-performance/performance_{country_name}.xlsx"

# Verifying if the file already exists
if not os.path.exists(folder_per):
    # Si el archivo no existe, guardar el DataFrame
    merged_df.to_excel(folder_per, index=False, engine="openpyxl")
else:
    # If file do exist, read it and merge
    performance_df = pd.read_excel(folder_per)
    performance_select_df=performance_df.loc[:, ['country', 'question', 'answer_llm_chunking','supporting_chunks']]
    merged2_df = merged_df.join(performance_select_df, how='inner',lsuffix='_left', rsuffix='_right')
    # Dropping repeated columns
    merged2_df = merged2_df.drop(columns=['country_right', 'question_right'])
    # Changing names of the columns
    merged2_df = merged2_df.rename(columns={'country_left': 'country', 'question_left': 'question'})
    merged2_df.to_excel(folder_per, index=False, engine="openpyxl")
merged2_df.head()


#### _3.1 Questions: Yes/No_

In [8]:
# Calculate performance metrics
merged_df['gs_binary'] = merged_df['answer_gs'].apply(lambda x: 1 if x == 'yes' else 0)
merged_df['llm_binary'] = merged_df['answer_llm'].apply(lambda x: 1 if x == 'yes' else 0)

# Calculate Accuracy, Precision, Recall, and F1 Score
accuracy = accuracy_score(merged_df['gs_binary'], merged_df['llm_binary'])
precision = precision_score(merged_df['gs_binary'], merged_df['llm_binary'], zero_division=0)
recall = recall_score(merged_df['gs_binary'], merged_df['llm_binary'], zero_division=0)
f1 = f1_score(merged_df['gs_binary'], merged_df['llm_binary'], zero_division=0)
questions_failed = merged_df[merged_df['gs_binary'] != merged_df['llm_binary']]['question'].unique()

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Questions where the model failed: {questions_failed}")

Accuracy: 0.9047619047619048
Precision: 0.875
Recall: 1.0
F1 Score: 0.9333333333333333
Questions where the model failed: [11 19]


#### _3.2 Questions: Extraction (period) ... IN PROGRESS_

### _Performance: Netherlands_

In [18]:
# Only for re-calculating the metrics with new annotations
country_test = netherlands
country_name = "netherlands"

In [77]:
# Loading the gold standard
annotations_df = pd.read_excel('../data/5-annotations/annotations.xlsx')
annotations_df = annotations_df[annotations_df['country'] == country_name]

# Loading the LLM responses
with open(f'../data/4-summary-responses-json/summary_response_{country_name}_usingchunks.json', 'r') as f:
    llm_response = json.load(f)
llm_response_df = pd.DataFrame(llm_response)
#llm_response_df['question'] = llm_response_df['question'].astype(str)

# Merge the ground truth with the LLM responses based on both country and question
merged_df = pd.merge(annotations_df, llm_response_df, on=["country", "question"], suffixes=('_gs', '_llm_chunking'))

merged_df.head()


Unnamed: 0,country,question,answer_gs,comment_input_for_protocol,supporting_text_when_true_answer_is_yes_but_prediction_is_no,answer_llm_chunking,supporting_chunks
0,netherlands,period_start,2015,,,2016,
1,netherlands,period_end,2019,,,2016,
2,netherlands,1,yes,,Actors agree to the following goals: The numbe...,yes,[The approach to antibiotic resistance we prop...
3,netherlands,2,yes,,The reduction of antibiotic use has flattened....,yes,[The main focus within the approach to antibio...
4,netherlands,3,yes,,"Despite these uncertainties, the research by Z...",yes,[The approach to antibiotic resistance we prop...


In [96]:
# Saving file .xlsx with results
folder_per = f"../data/6-performance/performance_{country_name}.xlsx"

# Verificar si el archivo existe
if not os.path.exists(folder_per):
    # Si el archivo no existe, guardar el DataFrame
    merged_df.to_excel(folder_per, index=False, engine="openpyxl")
else:
    # Si el archivo existe, leerlo y realizar el merge
    performance_df = pd.read_excel(folder_per)
    performance_df_select=performance_df.loc[:, ['country', 'question', 'answer_llm_full_document','supporting_text']]
    merged2_df = merged_df.join(performance_select_df, how='inner',lsuffix='_left', rsuffix='_right')
    # Dropping repeated columns
    merged2_df = merged2_df.drop(columns=['country_right', 'question_right'])
    # Changing names of the columns
    merged2_df = merged2_df.rename(columns={'country_left': 'country', 'question_left': 'question'})
    merged2_df.to_excel(folder_per, index=False, engine="openpyxl")
merged2_df.head()


Unnamed: 0,country,question,answer_gs,comment_input_for_protocol,supporting_text_when_true_answer_is_yes_but_prediction_is_no,answer_llm_chunking,supporting_chunks,answer_llm_full_document,supporting_text
0,netherlands,period_start,2015,,,2016,,2015,
1,netherlands,period_end,2019,,,2016,,2019,
2,netherlands,1,yes,,Actors agree to the following goals: The numbe...,yes,[The approach to antibiotic resistance we prop...,yes,The approach to antibiotic resistance addresse...
3,netherlands,2,yes,,The reduction of antibiotic use has flattened....,yes,[The main focus within the approach to antibio...,yes,The goal is reduction and prudent use of antib...
4,netherlands,3,yes,,"Despite these uncertainties, the research by Z...",yes,[The approach to antibiotic resistance we prop...,yes,"Knowledge about the development of, the occurr..."


#### _3.1 Questions: Yes/No_

In [22]:
# Calculate performance metrics
merged_df['gs_binary'] = merged_df['answer_gs'].apply(lambda x: 1 if x == 'yes' else 0)
merged_df['llm_binary'] = merged_df['answer_llm_chunking'].apply(lambda x: 1 if x == 'yes' else 0)

# Calculate Accuracy, Precision, Recall, and F1 Score
accuracy = accuracy_score(merged_df['gs_binary'], merged_df['llm_binary'])
precision = precision_score(merged_df['gs_binary'], merged_df['llm_binary'], zero_division=0)
recall = recall_score(merged_df['gs_binary'], merged_df['llm_binary'], zero_division=0)
f1 = f1_score(merged_df['gs_binary'], merged_df['llm_binary'], zero_division=0)
questions_failed = merged_df[merged_df['gs_binary'] != merged_df['llm_binary']]['question'].unique()

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Questions where the model failed: {questions_failed}")

Accuracy: 0.6190476190476191
Precision: 0.6
Recall: 0.8181818181818182
F1 Score: 0.6923076923076923
Questions where the model failed: [4 7 9 12 13 16 17 18]
