## **Running prompts on OpenAI**

### **0. BASE SETTING**

#### _0.1 Setting_

In [20]:
%pip install markdown
%pip install annoy
%pip install openai
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [21]:
from openai import OpenAI # OpenAI API
import json
import requests # to download some resources
import os # file operations
import numpy as np # linear algebra
import pandas as pd # data processing
from markdown import markdown # to render markdown
from IPython.display import Markdown
import annoy # Approximate Nearest Neighbors Oh Yeah for fast searching
import pickle
from annoy import AnnoyIndex
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#### _0.2 Loading MD files of the sample of countries_

In [22]:
# LOADING THE MD FILES

sample_countries = ['Libya', 'Malta', 'Myanmar', 'Oman', 'Australia_2021-01-01']

# Folder with the MD files
folder_mds = "../data/3-naps-md"

# Importing MDs files
for file in os.listdir(folder_mds):
    for country in sample_countries:
        if country in file:  
            with open(os.path.join(folder_mds, file), "r", encoding="utf-8") as md_file:
                content = md_file.read()
            globals()[country.lower()] = content # Saving the MD file in lowercase

#### _0.3 Country to evaluate_

In [53]:
country_test = malta
country_name = "malta"

### **1. SETTING UP THE MODEL**

#### _1.1 API key_

In [54]:
# read in our api key
with open('../api-keys/our_api_key.txt', 'r') as file:
    api_key = file.read().replace('\n', '')

# read in finns api key (the one we'll use for testing)
with open('../api-keys/finns_api_key.txt', 'r') as file:
    finns_api_key = file.read().replace('\n', '')

In [55]:
# Setting the OpenAI
client = OpenAI(api_key=finns_api_key) # using finns for testing

#### _1.2 Chunks_

In [56]:
# FUNCTION: Chunk the markdown
def chunk_markdown(md_text, max_chars=3000):
    """Chunks some markdown by adding new lines until exceeding max_chars.
       Each chunk includes the last line of the previous chunk."""

    lines = md_text.split("\n")  # Split into lines
    chunks = []
    current_chunk = []
    current_length = 0

    for i, line in enumerate(lines):
        # Always include the previous line for context
        if i > 0 and current_length + len(line) > max_chars:
            chunks.append("\n".join(current_chunk))  # Save the current chunk
            current_chunk = [lines[i-1]]  # Start new chunk with the preceding line
            current_length = len(lines[i-1])  # Reset length tracker

        current_chunk.append(line)
        current_length += len(line) + 1  # +1 for the newline character

    # Add the last chunk
    if current_chunk:
        chunks.append("\n".join(current_chunk))

    return chunks

In [57]:
# Chunking the markdown
chunks = chunk_markdown(country_test, max_chars=3000)

### **2. RUNNING THE MODEL**

#### _2.1 Question 0: Period for NAP_

In [58]:
responses_period = []

for i, chunk in enumerate(chunks):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature= 0,
        top_p = 0.5,
        response_format={"type":"json_object"},
        messages=[
            {"role": "system", "content": """
            You are an assistant that extract information from national action plans for antimicrobial resistance (AMR). 
            Be concise and rely only on the text content.

            From this section of the National Action Plan, extract the period considered for the actions of the current National Action Plan and return it in a structured JSON format.
            If the period is not mentioned, return the period as null. No include other periods related to specific actions or goals.
            
            The response **must** follow this exact JSON structure:
                {
                    "period_start": The year when starts the period as numeric or null,
                    "period_end": The year when ends the period as numeric or null
                    "supporting_chunk": If the period_start and period_end are not null, include the all chunk text that supports the answer
                }
            """},
            {"role": "user", "content": chunk},
        ]
    )
    response_content = response.choices[0].message.content
    parsed_response = json.loads(response_content)
    responses_period.append(parsed_response)
    

In [59]:
start_years = [r["period_start"] for r in responses_period if r["period_start"] != None]
end_years = [r["period_end"] for r in responses_period if r["period_end"] != None]

# Function to find the most recurrent year
def most_recurrent_year(years):
    year_freq = {}
    
    # Count the frequency of each year
    for year in years:
        if year in year_freq:
            year_freq[year] += 1
        else:
            year_freq[year] = 1
    
    # Find the year with the maximum frequency
    if year_freq:
        return max(year_freq, key=year_freq.get)
    return "null"

# Final result using the most recurrent year
final_response_period = [
    {
        "country": country_name,
        "question": "period_start",
        "answer": most_recurrent_year(start_years)
    },
    {
        "country": country_name,
        "question": "period_end",
        "answer": most_recurrent_year(end_years)
    }
]

# Printing the final result
print(final_response_period)

[{'country': 'malta', 'question': 'period_start', 'answer': 2020}, {'country': 'malta', 'question': 'period_end', 'answer': 2025}]


#### _2.2 Questions 1-19: Yes/No_

In [60]:
# QUESTIONS
questions_yesno = [
    "Does the NAP adopt a One Health approach by addressing multiple priority sectors (such as human health, animal health, environment, and agriculture/food security) through strategies, policies, goals, or actions?",
    "Does the NAP include any distinct strategy/policy/goal/action explicitly targeting the priority sector of 'Human Health'?",
    "Does the NAP include any distinct strategy/policy/goal/action explicitly targeting the priority sector of 'Animal Health'?",
    "Does the NAP include any distinct strategy/policy/goal/action explicitly targeting the priority sector of 'Environment'?",
    "Does the NAP include any distinct strategy/policy/goal/action explicitly targeting the priority sector of 'Agriculture/Food Security'?",
    "Does the NAP include any mechanisms for progress reporting to track how its objectives are being met, such as an annual or semi-annual progress report, a dashboard displaying the status, or similar documents/tools?",
    "Does the NAP include any specific, time-bound targets to track progress?",
    "Does the NAP outline quantitative targets for AMR prevalence and antimicrobial use?",
    "Does the NAP specify any budget allocation assigned for the strategies/policies/goals/actions directed at tackling AMR?",
    "Does the NAP contain an assessment of future budget requirements for different activities listed in the document?",
    "Does the NAP establish a multisectoral committee or task force for AMR coordination?",
    "Does the NAP include any strategy/policy/goal/action related to strengthening the legislative and regulatory framework for AMR, including laws and regulations for antimicrobial use?",
    "Does the NAP include any strategy/policy/goal/action related to training and educational initiative to increase awareness about AMR in school curricula or professional training programs for doctors and pharmacists?",
    "Does the NAP include any strategy/policy/goal/action related to public awareness campaigns in combating AMR, such as media involvement and community engagement?",
    "Does the NAP include any strategy/policy/goal/action related to international collaboration and support for AMR initiatives, such as partnerships with organizations like World Health Organization (WHO), Food and Agriculture Organization (FAO), and World Organisation for Animal Health (OIE)?",
    "Does the NAP include any strategy/policy/goal/action related to data collection and analysis for AMR surveillance, such as developing surveillance systems, data collection tools, setting up laboratories for AMR monitoring, or other similar actions?",
    "Does the NAP include any strategy/policy/goal/action related to developing the capacity to detect and report newly emerged resistance that may constitute a public health emergency of international concern, as required by the International Health Regulations (2005)?",
    "Does the NAP include any strategy/policy/goal/action related to specific prevention strategies such as sanitation, hygiene measures, and infection prevention and control (IPC)?",
    "Does the NAP include any strategy/policy/goal/action related to implementing international standards and guidelines for antimicrobial resistance (AMR) control, such as (but not limited to) the OIE Terrestrial and Aquatic Animal Health Codes or the FAO/WHO Codex Alimentarius Code of Practice to Minimize and Contain AMR?",
    "Does the NAP include any strategy/policy/goal/action related to regulating the distribution, prescription, and dispensation of antibiotics, such as developing or maintaining essential medicine lists for antibiotics?",
    "Does the NAP include any strategy/policy/goal/action related to requiring the rational use of antibiotics through regulations/laws?",
    "Does the NAP include any strategy/policy/goal/action related to participating in international collaborative research to support the development of new medicines, diagnostic tools, and vaccines?"
]


In [61]:
responses_yesno = []

for j, question in enumerate(questions_yesno):
    final_answer = "no" # Default answer
    supporting_chunks = []

    for i, chunk in enumerate(chunks):
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            temperature= 0,
            top_p = 0.5,
            response_format={"type":"json_object"},
            messages=[
                {"role": "system", "content": f"""
                You are an assistant that analyze and summarise information from national action plans for antimicrobial resistance (AMR). 
                Be concise and rely only on the text content.

                From this section of the National Action Plan, analyze and answer the following yes/no question: "{question}".
                If you cannot answer with the information provided, return null.
                
                The response **must** follow this exact JSON structure:
                {{
                    "value": yes/no/null,
                    "supporting_chunk": "If the answer is yes, include the chunk text that supports the answer"
                }}
                """},
                {"role": "user", "content": chunk},
            ]
        )
        response_content = response.choices[0].message.content
        parsed_response = json.loads(response_content)
        value = parsed_response.get("value")
        supporting_chunk = parsed_response.get("supporting_chunk")    
        if value == "yes":
            final_answer = "yes"
            supporting_chunks.append(supporting_chunk)            
        
    # After checking all chunks for this question, append the final result
    number_question = j + 1
    responses_yesno.append({
        "country": country_name,
        "question": number_question,
        "answer": final_answer,
        "supporting_chunks": supporting_chunks if final_answer == "yes" else []
    })    

In [62]:
# Printing the final responses
for response in responses_yesno:
    print(f"Question: {response['question']}")
    print(f"Final Answer: {response['answer']}")
    if response['answer'] == "yes":
        print("Supporting Chunks:")
        for chunk in response['supporting_chunks']:
            print(chunk)
    print()  

Question: 1
Final Answer: yes
Supporting Chunks:
AMR is not only a challenge in human healthcare but equally impacts on animal health and the environment. This complex, inter-related, problem requires concerted efforts by various stakeholders from a multitude of sectors - including healthcare specialists, veterinarians, pharmacists, educationalists, policy makers, legislative bodies, agriculture, industry and the public.
This reflects the wider action plans on this important issue of the Food and Agriculture Organization (FAO)'s, World Organisation for Animal Health Organisation and World Health Organisation (WHO)'s.
5.1 Legislation regulating Human Health and Animal Sector
It recognizes the interconnectedness of humans, animals and the environment and adopts a whole of Government, a whole of society and a comprehensive One Health Approach to AMR, recognizing the complexity of the issue.
The Strategy is a cross-sectoral response to the threat of AMR in Malta. Its focus lies in controll

In [63]:
# Combining responses_yesno to the response_period
summary_response = final_response_period.copy()
summary_response.extend(responses_yesno)

In [64]:
# Converting to a JSON string
summary_response_json = json.dumps(summary_response, indent=4)

# Saving
with open(f'../data/4-summary-responses-json/version_yesno_updated/summary_response_{country_name}_usingchunks.json', 'w') as f:
    f.write(summary_response_json)

---

### **3. CALCULATING THE PERFORMANCE**

### _Performance: Libya_

In [65]:
# Only for re-calculating the metrics with new annotations
country_test = malta
country_name = "malta"

In [66]:
# Loading the gold standard
annotations_df = pd.read_excel('../data/5-annotations/annotations_new.xlsx')
annotations_df = annotations_df[annotations_df['country'] == country_name]

# Loading the LLM responses
with open(f'../data/4-summary-responses-json/version_yesno_updated/summary_response_{country_name}_usingchunks.json', 'r') as f:
    llm_response = json.load(f)
llm_response_df = pd.DataFrame(llm_response)
#llm_response_df['question'] = llm_response_df['question'].astype(str)

# Merge the ground truth with the LLM responses based on both country and question
merged_df = pd.merge(annotations_df, llm_response_df, on=["country", "question"], suffixes=('_gs', '_llm'))

# keep the country of interest
merged_df


Unnamed: 0,country,question,answer_gs,comment_input_for_protocol,supporting_text_when_true_answer_is_yes_but_prediction_is_no,answer_llm,supporting_chunks
0,malta,period_start,2020,,,2020,
1,malta,period_end,2028,,,2025,
2,malta,1,yes,,,yes,[AMR is not only a challenge in human healthca...
3,malta,2,yes,,,yes,[This Strategy and Action Plan focuses on impr...
4,malta,3,yes,,,yes,[the Ministry of Agriculture Fisheries and Ani...
5,malta,4,yes,,,yes,[The main strategic aims of the AMR Strategy a...
6,malta,5,yes,,,yes,[the Ministry of Agriculture Fisheries and Ani...
7,malta,6,yes,,,yes,[Implementation and evaluation of the Strategy...
8,malta,7,yes,,,yes,[the Ministry for Health is launching a 'Strat...
9,malta,8,no,,,yes,[Set and monitor qualitative and quantitative ...


#### _3.1 Questions: Yes/No_

In [67]:
# Calculate performance metrics
merged_df['gs_binary'] = merged_df['answer_gs'].apply(lambda x: 1 if x == 'yes' else 0)
merged_df['llm_binary'] = merged_df['answer_llm'].apply(lambda x: 1 if x == 'yes' else 0)

# Calculate Accuracy, Precision, Recall, and F1 Score
accuracy = accuracy_score(merged_df['gs_binary'], merged_df['llm_binary'])
precision = precision_score(merged_df['gs_binary'], merged_df['llm_binary'], zero_division=0)
recall = recall_score(merged_df['gs_binary'], merged_df['llm_binary'], zero_division=0)
f1 = f1_score(merged_df['gs_binary'], merged_df['llm_binary'], zero_division=0)
questions_failed = merged_df[merged_df['gs_binary'] != merged_df['llm_binary']]['question'].unique()

# Performance for period extraction (period_start, period_end)
period_matches = merged_df[merged_df['question'].isin(["period_start", "period_end"])]

# For period extraction, we don't need binary conversion, so we directly compare the answers
period_matches['period_match'] = period_matches['answer_gs'] == period_matches['answer_llm']

# Calculate period extraction accuracy as the percentage of exact matches
period_accuracy = period_matches['period_match'].mean()  # Exact match percentage

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Questions where the model failed: {questions_failed}")
print(f"Period Extraction Accuracy: {period_accuracy:.2f}")

Accuracy: 0.9166666666666666
Precision: 0.95
Recall: 0.95
F1 Score: 0.95
Questions where the model failed: [8 10]
Period Extraction Accuracy: 0.50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  period_matches['period_match'] = period_matches['answer_gs'] == period_matches['answer_llm']
