In [1]:
import json 

In [2]:


# Specify the path to the JSON file
file_path = 'dataset/health_facilities.json'

# Read the JSON file
with open(file_path, 'r') as file:
    documents = json.load(file)


In [3]:
import hashlib

def generate_document_id(doc):
    """
    Generate a unique document ID based on the document content
    """
    # Serialize the document as a JSON string
    combined = f"{doc['Code']}-{doc['Name']}-{doc['Ward']}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [4]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [5]:
# Transform the keys
transformed_documents = []
for doc in documents:
    new_doc = {}
    for key, value in doc.items():
        new_key = key.lower().replace(" ", "_")
        new_doc[new_key] = value
    transformed_documents.append(new_doc)

# Replace the original documents with the transformed ones
documents = transformed_documents


In [6]:
documents[5640]

{'code': 10473,
 'name': 'Kanjuiri Dispensary',
 'registration_number': None,
 'keph_level': 'Level 2',
 'facility_type': 'Dispensaries and clinic-out patient only',
 'owner': 'Ministry of Health',
 'regulatory_body': 'Ministry of Health',
 'beds': 0,
 'cots': 0,
 'county': 'NYANDARUA',
 'constituency': 'OL KALOU',
 'sub_county': 'olkalou',
 'ward': 'KANJUIRI RANGE',
 'operation_status': 'Operational',
 'open_whole_day': 'No',
 'open_public_holidays': 'No',
 'open_weekends': 'No',
 'open_late_night': 'No',
 'service_names': None,
 'approved': 'Yes',
 'public_visible': 'Yes',
 'closed': 'No',
 'id': '77312f84'}

In [7]:
from collections import defaultdict

In [8]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [9]:
len(hashes), len(documents)

(8932, 8932)

In [10]:
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

In [11]:
with open('dataset/health-facilities-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [12]:
!head -n 200 dataset/health-facilities-with-ids.json

[
  {
    "code": 22998,
    "name": "Kaka Medical Clinic",
    "registration_number": null,
    "keph_level": "Level 2",
    "facility_type": "Dispensaries and clinic-out patient only",
    "owner": "Private Practice - Medical Specialist",
    "regulatory_body": null,
    "beds": 0,
    "cots": 0,
    "county": "KAKAMEGA",
    "constituency": "LURAMBI",
    "sub_county": "lurambi",
    "ward": "SHEYWE",
    "operation_status": "Operational",
    "open_whole_day": "No",
    "open_public_holidays": "No",
    "open_weekends": "No",
    "open_late_night": "No",
    "service_names": null,
    "approved": "Yes",
    "public_visible": "Yes",
    "closed": "No",
    "id": "a2ce25cd"
  },
  {
    "code": 22985,
    "name": "KOPANGA DISPENSARY",
    "registration_number": null,
    "keph_level": "Level 2",
    "facility_type": "Basic primary health care facility",
    "owner": "Ministry of Health",
    "regulatory_body": null,
    "beds": 2,
    "cots": 1,
    "county": "MIGORI",
    "constitue

In [15]:
prompt_template = """
You emulate a user interacting with a chatbot that provides information about health facilities in a region.
Imagine the user is looking for information on a medical facility near them. 

This record describes a health facility named "{Name}". 

Formulate 3 questions that might be asked to get a better understanding of the facility. Do not hallucinate or ask questions that require external knowledge.
Do not answer the questions, just provide the questions themselves. The questions should be answerable based on the information in the record and should not be too similar to each other.
Focus on the most important information for a potential patient, and avoid using the exact wording from the record.

{record_content}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", "question3"]
""".strip()


In [16]:
from openai import OpenAI
import os

client = OpenAI(
    base_url="https://models.inference.ai.azure.com",
    api_key=os.environ["GITHUB_TOKEN"]
)

In [17]:
def generate_questions(doc):
    record_content = json.dumps(doc, indent=2)
    prompt = prompt_template.format(Name=doc['name'].lower(), record_content=record_content)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [18]:
results = {}

In [19]:
from tqdm.auto import tqdm

In [21]:
from time import sleep

for i, doc in enumerate(tqdm(documents)):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

    if (i + 1) % 10 == 0:
        sleep(47)


  0%|          | 0/8932 [00:00<?, ?it/s]

RateLimitError: Error code: 429 - {'error': {'code': 'RateLimitReached', 'message': 'Rate limit of 50 per 86400s exceeded for UserByModelByDay. Please wait 85981 seconds before retrying.', 'details': None}}

In [22]:
len(results)

45

In [23]:
results

{'a2ce25cd': '["What are the operating hours of Kaka Medical Clinic?", "What types of services are provided at Kaka Medical Clinic?", "Is Kaka Medical Clinic approved and publicly visible?"]',
 '93d22675': '["What is the KEPH level of Kopanga Dispensary?", "Who operates Kopanga Dispensary?", "Is Kopanga Dispensary open on weekends?"]',
 'b527cddb': '["What are the operating hours of Fairview Medical Centre during weekends?", "Is Fairview Medical Centre currently open and operational?", "How many beds and cots are available at Fairview Medical Centre?"]',
 '61733be4': '["What is the bed capacity at Radiant Group of Hospitals-Umoja?", "Is Radiant Group of Hospitals-Umoja open during weekends and public holidays?", "Who regulates Radiant Group of Hospitals-Umoja?"]',
 '5e4acb57': '[\n  "What are the operation hours for Songot Medical Clinic?",\n  "Who owns and operates Songot Medical Clinic?",\n  "Is Songot Medical Clinic open on weekends or public holidays?"\n]',
 'eec774b8': '["What are

In [24]:
parsed_result = {}

for doc_id, json_questions in results.items():
    parsed_result[doc_id] = json.loads(json_questions)

In [25]:
parsed_result

{'a2ce25cd': ['What are the operating hours of Kaka Medical Clinic?',
  'What types of services are provided at Kaka Medical Clinic?',
  'Is Kaka Medical Clinic approved and publicly visible?'],
 '93d22675': ['What is the KEPH level of Kopanga Dispensary?',
  'Who operates Kopanga Dispensary?',
  'Is Kopanga Dispensary open on weekends?'],
 'b527cddb': ['What are the operating hours of Fairview Medical Centre during weekends?',
  'Is Fairview Medical Centre currently open and operational?',
  'How many beds and cots are available at Fairview Medical Centre?'],
 '61733be4': ['What is the bed capacity at Radiant Group of Hospitals-Umoja?',
  'Is Radiant Group of Hospitals-Umoja open during weekends and public holidays?',
  'Who regulates Radiant Group of Hospitals-Umoja?'],
 '5e4acb57': ['What are the operation hours for Songot Medical Clinic?',
  'Who owns and operates Songot Medical Clinic?',
  'Is Songot Medical Clinic open on weekends or public holidays?'],
 'eec774b8': ['What are th

In [26]:
doc_index = {doc['id']: doc for doc in documents}

In [27]:
final_results = []

for doc_id, questions in parsed_result.items():
    county = doc_index[doc_id]['county'].lower()
    for q in questions:
        final_results.append((q, county, doc_id))

In [28]:
final_results

[('What are the operating hours of Kaka Medical Clinic?',
  'kakamega',
  'a2ce25cd'),
 ('What types of services are provided at Kaka Medical Clinic?',
  'kakamega',
  'a2ce25cd'),
 ('Is Kaka Medical Clinic approved and publicly visible?',
  'kakamega',
  'a2ce25cd'),
 ('What is the KEPH level of Kopanga Dispensary?', 'migori', '93d22675'),
 ('Who operates Kopanga Dispensary?', 'migori', '93d22675'),
 ('Is Kopanga Dispensary open on weekends?', 'migori', '93d22675'),
 ('What are the operating hours of Fairview Medical Centre during weekends?',
  'nairobi',
  'b527cddb'),
 ('Is Fairview Medical Centre currently open and operational?',
  'nairobi',
  'b527cddb'),
 ('How many beds and cots are available at Fairview Medical Centre?',
  'nairobi',
  'b527cddb'),
 ('What is the bed capacity at Radiant Group of Hospitals-Umoja?',
  'nairobi',
  '61733be4'),
 ('Is Radiant Group of Hospitals-Umoja open during weekends and public holidays?',
  'nairobi',
  '61733be4'),
 ('Who regulates Radiant G

In [29]:
import pandas as pd

In [30]:
df = pd.DataFrame(final_results, columns=["question", "county", "document"])

In [31]:
df.to_csv("dataset/ground-truth-data.csv", index=False)

In [32]:
facilities_by_county = df.groupby('county').size().reset_index(name='count')
facilities_by_county = facilities_by_county.sort_values('count', ascending=False)

print(facilities_by_county)

         county  count
13      nairobi     45
9         kitui     12
6       kericho      9
10         lamu      6
18      turkana      6
14        narok      6
19  uasin gishu      6
5      kakamega      6
4       kajiado      6
7        kilifi      3
8        kisumu      3
1         busia      3
11       migori      3
12     murang'a      3
3      homa bay      3
15    nyandarua      3
16        nyeri      3
17  trans nzoia      3
2       garissa      3
0       bungoma      3
