In [1]:
import json 

In [2]:


# Specify the path to the JSON file
file_path = 'dataset/health_facilities.json'

# Read the JSON file. The encoding is pinned to UTF-8 (the standard JSON
# encoding) so the load does not depend on the platform's default locale.
with open(file_path, 'r', encoding='utf-8') as file:
    documents = json.load(file)


In [3]:
documents

[{'Code': 22998,
  'Name': 'Kaka Medical Clinic',
  'Registration_number': None,
  'Keph level': 'Level 2',
  'Facility type': 'Dispensaries and clinic-out patient only',
  'Owner': 'Private Practice - Medical Specialist',
  'Regulatory body': None,
  'Beds': 0,
  'Cots': 0,
  'County': 'KAKAMEGA',
  'Constituency': 'LURAMBI',
  'Sub county': 'lurambi',
  'Ward': 'SHEYWE',
  'Operation status': 'Operational',
  'Open_whole_day': 'No',
  'Open_public_holidays': 'No',
  'Open_weekends': 'No',
  'Open_late_night': 'No',
  'Service_names': None,
  'Approved': 'Yes',
  'Public visible': 'Yes',
  'Closed': 'No'},
 {'Code': 22985,
  'Name': 'KOPANGA DISPENSARY',
  'Registration_number': None,
  'Keph level': 'Level 2',
  'Facility type': 'Basic primary health care facility',
  'Owner': 'Ministry of Health',
  'Regulatory body': None,
  'Beds': 2,
  'Cots': 1,
  'County': 'MIGORI',
  'Constituency': 'SUNA WEST',
  'Sub county': 'suna west sub county',
  'Ward': 'WASIMBETE',
  'Operation status

In [4]:
import hashlib

def generate_document_id(doc):
    """
    Derive a short, deterministic ID for a record.

    The record's 'Code', 'Name' and 'Ward' values are joined with hyphens,
    hashed with MD5, and the first 8 hex characters of the digest are
    returned. Other fields of the record do not influence the ID.
    """
    fingerprint = "{}-{}-{}".format(doc['Code'], doc['Name'], doc['Ward'])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    # Keep only a short prefix of the digest as the ID.
    return digest[:8]

In [5]:
# Attach the generated ID to every record (the dicts are modified in place).
for doc in documents:
    doc.update(id=generate_document_id(doc))

In [6]:
documents[5640]

{'Code': 10473,
 'Name': 'Kanjuiri Dispensary',
 'Registration_number': None,
 'Keph level': 'Level 2',
 'Facility type': 'Dispensaries and clinic-out patient only',
 'Owner': 'Ministry of Health',
 'Regulatory body': 'Ministry of Health',
 'Beds': 0,
 'Cots': 0,
 'County': 'NYANDARUA',
 'Constituency': 'OL KALOU',
 'Sub county': 'olkalou',
 'Ward': 'KANJUIRI RANGE',
 'Operation status': 'Operational',
 'Open_whole_day': 'No',
 'Open_public_holidays': 'No',
 'Open_weekends': 'No',
 'Open_late_night': 'No',
 'Service_names': None,
 'Approved': 'Yes',
 'Public visible': 'Yes',
 'Closed': 'No',
 'id': '77312f84'}

In [7]:
from collections import defaultdict

In [8]:
# Group records by their generated ID so that collisions become visible:
# an ID shared by several records ends up with a list longer than one.
hashes = defaultdict(list)

for doc in documents:
    hashes[doc['id']].append(doc)

In [9]:
len(hashes), len(documents)

(8932, 8932)

In [10]:
# Print every ID that is shared by more than one record, with the group size.
for hash_id, group in hashes.items():
    if len(group) <= 1:
        continue
    print(hash_id, len(group))

In [11]:
# Serialize the ID-enriched records first, then write the text out in one go.
serialized = json.dumps(documents, indent=2)
with open('dataset/health-facilities-with-ids.json', 'wt') as f_out:
    f_out.write(serialized)

In [12]:
!head -n 200 dataset/health-facilities-with-ids.json

[
  {
    "Code": 22998,
    "Name": "Kaka Medical Clinic",
    "Registration_number": null,
    "Keph level": "Level 2",
    "Facility type": "Dispensaries and clinic-out patient only",
    "Owner": "Private Practice - Medical Specialist",
    "Regulatory body": null,
    "Beds": 0,
    "Cots": 0,
    "County": "KAKAMEGA",
    "Constituency": "LURAMBI",
    "Sub county": "lurambi",
    "Ward": "SHEYWE",
    "Operation status": "Operational",
    "Open_whole_day": "No",
    "Open_public_holidays": "No",
    "Open_weekends": "No",
    "Open_late_night": "No",
    "Service_names": null,
    "Approved": "Yes",
    "Public visible": "Yes",
    "Closed": "No",
    "id": "a2ce25cd"
  },
  {
    "Code": 22985,
    "Name": "KOPANGA DISPENSARY",
    "Registration_number": null,
    "Keph level": "Level 2",
    "Facility type": "Basic primary health care facility",
    "Owner": "Ministry of Health",
    "Regulatory body": null,
    "Beds": 2,
    "Cots": 1,
    "County": "MIGORI",
    "Constitue

In [13]:
# Prompt used to ask the model for user-style questions about one facility.
# It has two `str.format` placeholders:
#   {Name}           - the facility name
#   {record_content} - the text of the record to base the questions on
# `.strip()` removes the leading/trailing newlines of the triple-quoted literal.
prompt_template = """
You emulate a user interacting with a chatbot that provides information about health facilities in a region.
Imagine the user is looking for information on a medical facility near them. 

This record describes a health facility named "{Name}". 

Formulate 5 questions that might be asked to get a better understanding of the facility. 
Focus on the most important information for a potential patient, and avoid using the exact wording from the record.

{record_content}
""".strip()


In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

2024-07-30 23:52:05.613630: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-30 23:52:05.776930: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 23:52:05.861628: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 23:52:05.862232: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-30 23:52:05.995326: I tensorflow/core/platform/cpu_feature_gua

In [15]:
# Options for loading the causal LM from the local checkpoint directory.
# NOTE(review): trust_remote_code=True lets the checkpoint supply its own
# model code - only keep it for checkpoints from a trusted source.
model_load_kwargs = {
    "device_map": "cuda",
    "torch_dtype": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(
    "./models/phi-3/mini-model", **model_load_kwargs
)


# The tokenizer is stored in its own local directory next to the model.
tokenizer = AutoTokenizer.from_pretrained("./models/phi-3/mini-tokenizer")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
# Wrap the loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

In [17]:
def generate_questions_batch(docs):
    """
    Generate question text for every record in ``docs``.

    Each record is rendered as indented JSON, inserted into
    ``prompt_template`` together with its 'Name', and sent to ``pipe`` one
    prompt at a time.

    Returns a list holding the generated text for each record, in the same
    order as ``docs``.
    """
    # The generation settings are the same for every prompt, so build them once:
    # sampling is disabled and only the newly generated text is returned.
    generation_args = {
        "max_new_tokens": 500,
        "return_full_text": False,
        "do_sample": False,
    }

    raw_outputs = []
    for doc in docs:
        record_content = json.dumps(doc, indent=2)
        prompt = prompt_template.format(Name=doc['Name'], record_content=record_content)
        raw_outputs.append(pipe(prompt, **generation_args))

    # Each pipeline call returns a list; its first item carries the text.
    generated_texts = []
    for raw in raw_outputs:
        generated_texts.append(raw[0]["generated_text"])
    return generated_texts

In [18]:
from tqdm.auto import tqdm

import concurrent.futures

In [20]:
# Maps document ID -> raw generated text for that document.
results = {}

# Define a function to process a batch of documents
def process_batch(batch_docs):
    """
    Generate questions for ``batch_docs`` and store them in ``results``.

    The generated text of each document is stored under the document's 'id'.
    Nothing is returned; an exception raised during generation propagates to
    whoever inspects the corresponding future.
    """
    batch_questions = generate_questions_batch(batch_docs)
    for doc, questions in zip(batch_docs, batch_questions):
        doc_id = doc['id']
        results[doc_id] = questions

# Set the batch size and number of workers
# NOTE(review): every worker thread calls the same `pipe`/model object;
# confirm that concurrent calls are safe and actually faster on this setup.
batch_size = 8
num_workers = 4

# Create a ThreadPoolExecutor with the specified number of workers
executor = concurrent.futures.ThreadPoolExecutor(max_workers=num_workers)
futures = []

try:
    # Submit every batch up front and keep the futures so failures are not lost
    for i in range(0, len(documents), batch_size):
        batch_docs = documents[i:i+batch_size]
        futures.append(executor.submit(process_batch, batch_docs))

    # Advance the progress bar as batches actually finish (not as they are
    # queued) and re-raise the first worker exception instead of dropping it
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
        future.result()
except BaseException:
    # On an error or a manual interrupt, drop the batches that have not
    # started yet so they do not keep running in the background
    for future in futures:
        future.cancel()
    raise
finally:
    # Shutdown the executor to free system resources (waits for the batches
    # that are already running)
    executor.shutdown()

  0%|          | 0/1117 [00:00<?, ?it/s]

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.
You are not running the flash-attention implementation, expect numerical differences.
You are not running the flash-attention implementation, expect numerical differences.
You are not running the flash-attention implementation, expect numerical differences.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


KeyboardInterrupt: 

In [50]:
len(results)

32

In [31]:
results['a2ce25cd']

'\n\n## Your task:\n\nQuestion: What type of medical services does Kaka Medical Clinic offer, and are they approved by the regulatory body?\nAnswer: Kaka Medical Clinic offers dispensary and clinic-outpatient services and is approved by the regulatory body.\n\nQuestion: Can patients visit Kaka Medical Clinic on public holidays and weekends?\nAnswer: No, patients cannot visit Kaka Medical Clinic on public holidays and weekends.\n\nQuestion: Is Kaka Medical Clinic open late at night for emergency services?\nAnswer: No, Kaka Medical Clinic is not open late at night.\n\nQuestion: How many beds and cots are available at Kaka Medical Clinic for inpatient care?\nAnswer: Kaka Medical Clinic has no beds or cots available for inpatient care.\n\nQuestion: Who owns Kaka Medical Clinic and what is their professional background?\nAnswer: Kaka Medical Clinic is owned by a private practice led by a medical specialist.\n\n\n'

In [24]:
# Maps document ID -> generated text with surrounding whitespace removed.
# The model output is stored as plain text. The previous
# json.dumps()/json.loads() round trip on a string returned that same
# string, so it has been removed.
parsed_result = {}

for doc_id, questions in results.items():
    parsed_result[doc_id] = questions.strip()

In [52]:
# Show every parsed entry, followed by a 40-dash rule, for a visual check.
separator = "-" * 40
for doc_id, content in parsed_result.items():
    print(f"Document ID: {doc_id}", f"Content: {content}", separator, sep="\n")

Document ID: 5119cfa4
Content: ## Your task:
Generate 5 questions that a user might ask to learn more about the "Africare Limited Embakasi Clinic" based on the provided information. The questions should be relevant to a potential patient's needs and should not directly repeat the information given.

### Question 1:
What is the bed capacity of the Africare Limited Embakasi Clinic, and does it offer overnight care?

### Question 2:
Can you tell me the regulatory body overseeing the Africare Limited Embakasi Clinic and its accreditation status?

### Question 3:
Is the Africare Limited Embakasi Clinic open on weekends and public holidays, and what are its operating hours during weekdays?

### Question 4:
As a private enterprise, what type of health care services does the Africare Limited Embakasi Clinic provide, and are these services approved by any health authority?

### Question 5:
Could you provide the exact location of the Africare Limited Embakasi Clinic, including the ward and sub-c