In [1]:
%load_ext autoreload
%autoreload 2

In [56]:
import json

from utils.settings import QUESTION_GEN_SYS_TMPL, OPENAI_MODEL_EMBEDDING
import utils.llamaindex_utils  as liu
import utils.sampling_data as sd
from utils.openai_api import generate_qa_file_batch_api, \
    create_json_nodes_llamaindex_batch_api, aprox_costs

# 1. Read and sample data

In [35]:
# Load the sample JSON data for a single patient.
# The encoding is passed explicitly: JSON files are UTF-8, whereas open() without
# an encoding falls back to the platform's locale encoding and can mis-decode
# non-ASCII characters on some systems.
with open("./data/Abe604_Runolfsdottir785_3718b84e-cbe9-1950-6c6c-e6f4fdc907be.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

In [36]:
# Pull out the bundle's "entry" collection and check how many entries it holds.
test = json_data["entry"]
len(test)

7521

In [37]:
test[0].keys()

dict_keys(['fullUrl', 'resource', 'request'])

In [38]:
# Tally how many entries exist for each FHIR resourceType in the full bundle.
resource_type_count = {}

for entry in test:
    # Entries without a "resource" payload are not counted.
    if "resource" not in entry:
        continue
    resource_type = entry["resource"].get("resourceType")
    # Skip resources whose resourceType is missing or empty.
    if not resource_type:
        continue
    resource_type_count[resource_type] = resource_type_count.get(resource_type, 0) + 1

resource_type_count

{'Patient': 1,
 'Encounter': 543,
 'Condition': 63,
 'DiagnosticReport': 767,
 'DocumentReference': 543,
 'Claim': 642,
 'ExplanationOfBenefit': 642,
 'CareTeam': 4,
 'CarePlan': 4,
 'MedicationRequest': 99,
 'Observation': 3517,
 'Procedure': 664,
 'Immunization': 10,
 'SupplyDelivery': 1,
 'Medication': 10,
 'MedicationAdministration': 10,
 'Provenance': 1}

In [39]:
type(json_data)

dict

In [40]:
# Render the question-generation system prompt for 5 questions per chunk,
# then measure how many tokens the rendered prompt occupies.
system_prompt = QUESTION_GEN_SYS_TMPL.format(num_questions_per_chunk=5)
system_prompt_tokens = sd.get_total_tokens_from_string(system_prompt)

print(f"Total system tokens: {system_prompt_tokens}")


Total system tokens: 322


In [41]:
# Split the bundle into a sampled subset and the remainder, passing in the
# system-prompt token count computed earlier in this notebook.
# NOTE(review): presumably max_entries caps the sample size and the token count
# feeds a per-entry token budget — confirm in utils.sampling_data.
sampled_data, remaining_data = sd.sample_resources(json_data, system_prompt_tokens, max_entries=1000)

In [70]:
len(sampled_data["entry"])

1000

In [43]:
json_data.keys()

dict_keys(['resourceType', 'type', 'entry'])

In [44]:
json_data["resourceType"]

'Bundle'

In [45]:
json_data["type"]

'transaction'

In [46]:
# Persist both halves of the split (sampled, then remaining) as indented JSON.
for out_path, payload in (
    ("./data/selected_entries.json", sampled_data),
    ("./data/remaining_entries.json", remaining_data),
):
    with open(out_path, "w") as f:
        json.dump(payload, f, indent=2)

In [47]:
# Tally how many entries exist for each FHIR resourceType in the sampled subset.
resource_type_count = {}

for entry in sampled_data["entry"]:
    # Entries without a "resource" payload are not counted.
    if "resource" not in entry:
        continue
    resource_type = entry["resource"].get("resourceType")
    # Skip resources whose resourceType is missing or empty.
    if not resource_type:
        continue
    resource_type_count[resource_type] = resource_type_count.get(resource_type, 0) + 1

resource_type_count

{'Patient': 1,
 'Encounter': 73,
 'Condition': 26,
 'DiagnosticReport': 99,
 'DocumentReference': 74,
 'Claim': 82,
 'ExplanationOfBenefit': 83,
 'CareTeam': 4,
 'CarePlan': 4,
 'MedicationRequest': 27,
 'Observation': 395,
 'Procedure': 101,
 'Immunization': 10,
 'SupplyDelivery': 1,
 'Medication': 10,
 'MedicationAdministration': 10}

In [48]:
sample_open_ai_test = sd.sample_one_per_resource_type(sampled_data)

# 2. Test token length of generated response sample

- Llama index: https://docs.llamaindex.ai/en/stable/understanding/loading/loading/
- Improvements:
    - Remove FQDNs

In [4]:
# Hand-written example of a five-pair Q&A response, used to gauge how many
# tokens a typical generated answer occupies.
test = """[
    {
        "question": "What is the status of the DiagnosticReport?",
        "answer": "final"
    },
    {
        "question": "What type of notes are included in the category of the DiagnosticReport?",
        "answer": "History and physical note, Evaluation + Plan note"
    },
    {
        "question": "Who is the performer of the DiagnosticReport?",
        "answer": "Dr. Hong136 Kassulke119"
    },
    {
        "question": "What is the effective date and time of the DiagnosticReport?",
        "answer": "1952-01-05T12:25:03+00:00"
    },
    {
        "question": "What is the patient's current insurance status?",
        "answer": "No insurance"
    }
]"""

# The token counter is reached through the `sd` module alias, as in the other
# cells; the bare function name is never imported in this notebook and would
# raise NameError on a fresh kernel.
total_tokens = sd.get_total_tokens_from_string(test)

In [5]:
total_tokens

165

# 3. Sample 10 OpenAI Answers

In [49]:
nodes = liu.create_json_nodes_llamaindex_test(sample_open_ai_test)

In [50]:
# Token count for every node, measured on the node content rendered with
# metadata_mode="all" and the embedding-model setting exposed by `sd`.
# (The name `lenght` is kept as-is because a later cell displays it.)
lenght = [
    sd.get_total_tokens_from_string(
        node.get_content(metadata_mode="all"), sd.OPENAI_MODEL_EMBEDDING
    )
    for node in nodes
]

In [51]:
lenght

[1252,
 622,
 402,
 1328,
 1736,
 639,
 1706,
 578,
 550,
 498,
 341,
 376,
 315,
 235,
 161,
 290]

In [52]:
result = liu.generate_qa_pairs([nodes[-1]])

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [53]:
result["node_0"]["openai_response"]

ChatCompletion(id='chatcmpl-9rFcQE578UBMi2MCAmBPHdI0db8YT', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{"questions_and_answers":[{"question":"What medication was administered to me on February 12, 1998?","answer":"You were administered 1 ML of tacrolimus at a concentration of 5 MG/ML. Tacrolimus is an immunosuppressant medication commonly used to prevent organ rejection in patients who have received a kidney transplant."},{"question":"What is the purpose of tacrolimus in my treatment?","answer":"Tacrolimus is used to suppress your immune system to prevent it from attacking the transplanted kidney. This is crucial after a renal transplant to help ensure that your body accepts the new organ and functions properly."},{"question":"What is the status of my medication administration?","answer":"The status of your medication administration is \'completed,\' which indicates that the administration of tacrolimus was successfully carried 

In [60]:
json.loads(result["node_0"]["openai_response"].choices[0].message.content)

{'questions_and_answers': [{'question': 'What medication was administered to me on February 12, 1998?',
   'answer': 'You were administered 1 ML of tacrolimus at a concentration of 5 MG/ML. Tacrolimus is an immunosuppressant medication commonly used to prevent organ rejection in patients who have received a kidney transplant.'},
  {'question': 'What is the purpose of tacrolimus in my treatment?',
   'answer': 'Tacrolimus is used to suppress your immune system to prevent it from attacking the transplanted kidney. This is crucial after a renal transplant to help ensure that your body accepts the new organ and functions properly.'},
  {'question': 'What is the status of my medication administration?',
   'answer': "The status of your medication administration is 'completed,' which indicates that the administration of tacrolimus was successfully carried out as planned."},
  {'question': 'Why was tacrolimus administered to me?',
   'answer': 'Tacrolimus was administered due to your history 

In [63]:
result["node_0"]["openai_response"].usage.prompt_tokens

690

In [84]:
# Parse the Q&A payload when the response is a raw string wrapped in a Markdown
# ```json fence. Only the fence (and stray quote characters) at the ends are
# removed; the JSON text itself is left untouched so that "\n" / "\t" escapes
# and any "json\n" text inside answers are preserved instead of being deleted.
raw_response = result["node_0"]["openai_response"].strip().strip("'`")
if raw_response.startswith("json"):
    # Drop the language tag that follows the opening fence.
    raw_response = raw_response[len("json"):]
# strict=False makes the decoder accept literal control characters (newlines,
# tabs) inside string values, which the model may emit unescaped.
json.loads(raw_response, strict=False)

{'questions_and_answers': [{'question': 'What is renal dialysis?',
   'answer': 'Renal dialysis is a medical procedure used to remove waste products and excess fluid from the blood when the kidneys are not functioning properly. It is typically required for patients with end-stage renal disease.'},
  {'question': 'Why did I undergo renal dialysis?',
   'answer': 'You underwent renal dialysis due to end-stage renal disease, which is a condition where your kidneys have lost their ability to effectively filter blood, necessitating dialysis as a life-sustaining treatment.'},
  {'question': 'When was my renal dialysis performed?',
   'answer': 'Your renal dialysis was performed on September 21, 1996, starting at 12:25 PM and concluding at 4:03 PM.'},
  {'question': 'Where did my dialysis session take place?',
   'answer': 'Your dialysis session took place at the Worcester Outpatient Clinic.'},
  {'question': "What does the status 'completed' mean regarding my dialysis procedure?",
   'answer

In [79]:
result["node_0"]["openai_response"]

'```json\n{\n  "questions_and_answers": [\n    {\n      "question": "What is renal dialysis?",\n      "answer": "Renal dialysis is a medical procedure used to remove waste products and excess fluid from the blood when the kidneys are not functioning properly. It is typically required for patients with end-stage renal disease."\n    },\n    {\n      "question": "Why did I undergo renal dialysis?",\n      "answer": "You underwent renal dialysis due to end-stage renal disease, which is a condition where your kidneys have lost their ability to effectively filter blood, necessitating dialysis as a life-sustaining treatment."\n    },\n    {\n      "question": "When was my renal dialysis performed?",\n      "answer": "Your renal dialysis was performed on September 21, 1996, starting at 12:25 PM and concluding at 4:03 PM."\n    },\n    {\n      "question": "Where did my dialysis session take place?",\n      "answer": "Your dialysis session took place at the Worcester Outpatient Clinic."\n    }

In [68]:
nodes[-1].metadata["id"]

'd48f3d68-cd9a-411c-8cc7-fb9afe062aad'

# 4. Generate jsonl file for OpenAI batch API

In [42]:
# Reload the sampled entries that an earlier cell saved to ./data/selected_entries.json.
# Note: this rebinds `json_data`, which previously held the full patient bundle.
with open("./data/selected_entries.json", "r") as f:
    json_data = json.load(f)

In [43]:
nodes = create_json_nodes_llamaindex_batch_api(json_data)
len(nodes)

1000

In [76]:
# Write the request file for the OpenAI batch API from the prepared nodes.
# The helper is called by the name this notebook actually imports from
# utils.openai_api (`generate_qa_file_batch_api`); the shorter `generate_qa_file`
# is not imported anywhere here and raises NameError on a fresh kernel.
# NOTE(review): keyword arguments are kept exactly as originally written —
# confirm they match the helper's current signature.
generate_qa_file_batch_api(nodes=nodes, output_file="./data/batch_api.jsonl")

# 5. Costs

In [4]:
answer = {'questions_and_answers': [{'question': 'What is renal dialysis?',
   'answer': 'Renal dialysis is a medical procedure used to remove waste products and excess fluid from the blood when the kidneys are not functioning properly. It is typically required for patients with end-stage renal disease.'},
  {'question': 'Why did I undergo renal dialysis?',
   'answer': 'You underwent renal dialysis due to end-stage renal disease, which is a condition where your kidneys have lost their ability to effectively filter blood, necessitating dialysis as a life-sustaining treatment.'},
  {'question': 'When was my renal dialysis performed?',
   'answer': 'Your renal dialysis was performed on September 21, 1996, starting at 12:25 PM and concluding at 4:03 PM.'},
  {'question': 'Where did my dialysis session take place?',
   'answer': 'Your dialysis session took place at the Worcester Outpatient Clinic.'},
  {'question': "What does the status 'completed' mean regarding my dialysis procedure?",
   'answer': "The status 'completed' indicates that the renal dialysis procedure was successfully performed and all necessary steps were carried out as intended, and you have completed that treatment session."}]}

In [6]:
sd.get_total_tokens_from_string(json.dumps(answer))

241

In [44]:
nodes[0]

TextNode(id_='e1f3ef42-e2cc-4459-a729-8337382713db', embedding=None, metadata={'resourceType': 'Patient', 'id': '3718b84e-cbe9-1950-6c6c-e6f4fdc907be'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='{"resourceType": "Patient", "id": "3718b84e-cbe9-1950-6c6c-e6f4fdc907be", "meta": {"profile": ["http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient"]}, "text": {"status": "generated", "div": "<div xmlns=\\"http://www.w3.org/1999/xhtml\\">Generated by <a href=\\"https://github.com/synthetichealth/synthea\\">Synthea</a>.Version identifier: 27e32d4\\n .   Person seed: -8963394751129060304  Population seed: 1720471530063</div>"}, "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"system": "urn:oid:2.16.840.1.113883.6.238", "code": "2028-9", "display": "Asian"}}, {"url": "text", "valueString": "Asian"}]}, {"url": "http://hl7.org/fhir/us/core/StructureDefi

In [57]:
# Approximate the cost of processing all nodes using aprox_costs' default rates.
# The call returns three values, unpacked here as the cost estimate and the
# total input / output token counts.
costs_batch_api, total_input_tokens, total_output_tokens = aprox_costs(nodes)
print(costs_batch_api, total_input_tokens, total_output_tokens)

0.166 1009520 300000


In [58]:
# Re-estimate the cost with explicit per-million-token prices (0.15 input,
# 0.6 output) instead of the function defaults.
# The three returned values are unpacked into their own names so that
# `costs_batch_api` — the float produced by the default-rate estimate — is not
# silently replaced by a tuple.
costs_custom_rates, total_input_tokens, total_output_tokens = aprox_costs(
    nodes, cost_per_million_input=0.15, cost_per_million_output=0.6)
print(costs_custom_rates, total_input_tokens, total_output_tokens)

(0.331, 1009520, 300000)


# TODO

- Cambiar notebooks a scripts
- Dejar lista la consulta para ChatGPT
    - Limitar la generación y revisar textos largos; usar gpt4 mini
- ¿Por qué es tan lenta la respuesta con llama.cpp?
- Revisar por qué llama.cpp deja la pregunta incompleta si se supera el máximo de contexto
- Terminar entregables con mejoras posteriores y next steps


In [8]:
sum([300]*10)

3000