In [1]:
import pandas as pd

## Retrieval Evaluation

In [2]:
from openai import OpenAI
import os

In [4]:
client = OpenAI(
    base_url="https://models.inference.ai.azure.com",
    api_key=os.environ["GITHUB_TOKEN"]
)

In [5]:
df = pd.read_csv('kenya_health_facilities_clean.csv')
documents = df.to_dict(orient='records')

In [6]:
prompt_template = """
You emulate a user of our healthcare facility assistant application.
Formulate 5 questions this user might ask based on the provided healthcare facility. The record
should contain the answer to the questions, and the questions should be complete and not too short.
Use as fewer words as possible from the record. 

The record:

name: {name}
keph_level: {keph_level}
facility_type: {facility_type}
owner: {owner}
regulatory_body: {regulatory_body}
beds: {beds}
cots: {cots}
county: {county}
constituency: {constituency}
sub_county: {sub_county}
ward: {ward} 
operation_status: {operation_status}
open_whole_day: {open_whole_day}
open_public_holidays: {open_public_holidays}
open_weekends: {open_weekends}
open_late_night: {open_late_night}
approved: {approved}
public_visible: {public_visible}
closed: {closed}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [7]:
prompt = prompt_template.format(**documents[0])

In [8]:
def llm(prompt):
    response = client.chat.completions.create(
        model = 'gpt-4o',
        messages = [{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [9]:
answers = llm(prompt)

In [10]:
import json

In [11]:
json.loads(answers)

['What type of healthcare facility is Fairview Medical Centre categorized as?',
 'Is Fairview Medical Centre operational on weekends?',
 'Which regulatory body oversees Fairview Medical Centre?',
 'How many beds does Fairview Medical Centre have?',
 'Is Fairview Medical Centre located in the Nairobi county?']

In [12]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [13]:
from tqdm.auto import tqdm

In [14]:
results = {}

In [18]:
from time import sleep

for i, doc in enumerate(tqdm(documents)): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw)
    results[doc_id] = questions

    if (i + 1) % 10 == 0:
        sleep(60)

  0%|          | 0/8598 [00:00<?, ?it/s]

RateLimitError: Error code: 429 - {'error': {'code': 'RateLimitReached', 'message': 'Rate limit of 10 per 60s exceeded for UserByModelByMinute. Please wait 45 seconds before retrying.', 'details': None}}

In [19]:
results

{2: {'questions': ['What level of care does Fairview Medical Centre provide?',
   'Is Fairview Medical Centre a public or a private facility?',
   'Does Fairview Medical Centre operate on weekends?',
   'How many beds are available at Fairview Medical Centre?',
   'In which constituency is Fairview Medical Centre located?']},
 3: ['What is the name of the secondary care hospital in Umoja?',
  'How many beds are available at the facility in Umoja?',
  'Is the hospital in Umoja open on public holidays?',
  'In which constituency is the hospital located?',
  'Is the hospital in Umoja visible to the public?'],
 5: ['What is the official name of the healthcare facility?',
  'Who owns the Vital Solutions Health Centre?',
  'What is the operation status of the facility?',
  'Is the facility approved by the regulatory body?',
  'How many beds are available at the health centre?'],
 7: ['What level is the Ankara Medical Centre classified as?',
  'Is Ankara Medical Centre operational on weekends

In [22]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

In [23]:
final_results

[(2, 'questions'),
 (3, 'What is the name of the secondary care hospital in Umoja?'),
 (3, 'How many beds are available at the facility in Umoja?'),
 (3, 'Is the hospital in Umoja open on public holidays?'),
 (3, 'In which constituency is the hospital located?'),
 (3, 'Is the hospital in Umoja visible to the public?'),
 (5, 'What is the official name of the healthcare facility?'),
 (5, 'Who owns the Vital Solutions Health Centre?'),
 (5, 'What is the operation status of the facility?'),
 (5, 'Is the facility approved by the regulatory body?'),
 (5, 'How many beds are available at the health centre?'),
 (7, 'What level is the Ankara Medical Centre classified as?'),
 (7, 'Is Ankara Medical Centre operational on weekends?'),
 (7, 'How many beds are available at the Ankara Medical Centre?'),
 (7, 'Who is the regulatory body for Ankara Medical Centre?'),
 (7, 'Is the Ankara Medical Centre privately owned or public?'),
 (9, 'What is the operational status of Nazareth Medical Services-Githura

In [26]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [28]:
df_results.to_csv('./ground_truth_retrieval.csv', index=False)

In [None]:
!