In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
import json

from utils.settings import QUESTION_GEN_SYS_TMPL, OPENAI_MODEL_EMBEDDING
import utils.llamaindex_utils  as liu

# 1. Read and sample data

In [35]:
# Load the sample JSON data for a single patient
with open("./data/Abe604_Runolfsdottir785_3718b84e-cbe9-1950-6c6c-e6f4fdc907be.json", "r") as f:
    json_data = json.load(f)

In [36]:
test = json_data["entry"]
len(test)

7521

In [37]:
test[0].keys()

dict_keys(['fullUrl', 'resource', 'request'])

In [38]:
resource_type_count = {}

for entry in test:
    if "resource" in entry:
        resource_type = entry["resource"].get("resourceType")
        if resource_type:
            if resource_type in resource_type_count:
                resource_type_count[resource_type] += 1
            else:
                resource_type_count[resource_type] = 1

resource_type_count

{'Patient': 1,
 'Encounter': 543,
 'Condition': 63,
 'DiagnosticReport': 767,
 'DocumentReference': 543,
 'Claim': 642,
 'ExplanationOfBenefit': 642,
 'CareTeam': 4,
 'CarePlan': 4,
 'MedicationRequest': 99,
 'Observation': 3517,
 'Procedure': 664,
 'Immunization': 10,
 'SupplyDelivery': 1,
 'Medication': 10,
 'MedicationAdministration': 10,
 'Provenance': 1}

In [39]:
type(json_data)

dict

In [40]:
system_prompt_tokens = sd.get_total_tokens_from_string(
    QUESTION_GEN_SYS_TMPL.format(num_questions_per_chunk=5))

print(f"Total system tokens: {system_prompt_tokens}")


Total system tokens: 322


In [41]:
sampled_data, remaining_data = sd.sample_resources(json_data, system_prompt_tokens, max_entries=1000)

In [70]:
len(sampled_data["entry"])

1000

In [43]:
json_data.keys()

dict_keys(['resourceType', 'type', 'entry'])

In [44]:
json_data["resourceType"]

'Bundle'

In [45]:
json_data["type"]

'transaction'

In [46]:
with open("./data/selected_entries.json", "w") as f:
    json.dump(sampled_data, f, indent=2)
with open("./data/remaining_entries.json", "w") as f:
    json.dump(remaining_data, f, indent=2)

In [47]:
resource_type_count = {}

for entry in sampled_data["entry"]:
    if "resource" in entry:
        resource_type = entry["resource"].get("resourceType")
        if resource_type:
            if resource_type in resource_type_count:
                resource_type_count[resource_type] += 1
            else:
                resource_type_count[resource_type] = 1

resource_type_count

{'Patient': 1,
 'Encounter': 73,
 'Condition': 26,
 'DiagnosticReport': 99,
 'DocumentReference': 74,
 'Claim': 82,
 'ExplanationOfBenefit': 83,
 'CareTeam': 4,
 'CarePlan': 4,
 'MedicationRequest': 27,
 'Observation': 395,
 'Procedure': 101,
 'Immunization': 10,
 'SupplyDelivery': 1,
 'Medication': 10,
 'MedicationAdministration': 10}

In [48]:
sample_open_ai_test = sd.sample_one_per_resource_type(sampled_data)

# 2. Test token lenght of generated response sample

- Llama index: https://docs.llamaindex.ai/en/stable/understanding/loading/loading/
- Improvements:
    - Remover FQDNs

In [4]:
test = """[
    {
        "question": "What is the status of the DiagnosticReport?",
        "answer": "final"
    },
    {
        "question": "What type of notes are included in the category of the DiagnosticReport?",
        "answer": "History and physical note, Evaluation + Plan note"
    },
    {
        "question": "Who is the performer of the DiagnosticReport?",
        "answer": "Dr. Hong136 Kassulke119"
    },
    {
        "question": "What is the effective date and time of the DiagnosticReport?",
        "answer": "1952-01-05T12:25:03+00:00"
    },
    {
        "question": "What is the patient's current insurance status?",
        "answer": "No insurance"
    }
]"""

total_tokens = get_total_tokens_from_string(test)

In [5]:
total_tokens

165

# 3. Sample 10 Open AI Answers

In [35]:
import json

In [36]:
import json


file_path = './data/batch_api_2.jsonl'

data = []

with open(file_path, 'r') as f:
    for line in f:
        data.append(json.loads(line.strip()))

In [40]:
data[100]["body"]["messages"][0]["content"]

"You are a knowledgeable and empathetic healthcare expert specializing in patient communication. Your task is to create up to 5 patient-centered, clear, informative, and contextually accurate questions and answers based on the provided FHIR data context. Focus on generating questions that explore specific details, implications, or next steps related to the context. Questions about medical terminology are valid if they clarify the implications or context-specific significance of a term for the patient. Avoid overly broad questions that simply define statuses or classifications without context-specific relevance. If the context does not provide enough detail to form a specific question, prioritize quality over quantity and produce fewer questions, or none at all, if necessary. Each answer should be detailed, specific, and address all aspects of the patient's question. Avoid general statements and provide thorough explanations where necessary. For example, if explaining a medical term or 

In [41]:
data[100]["body"]["messages"][1]["content"]

'Context information is below.\n---------------------\nresourceType: Condition\nid: ffaba4ab-1a7c-0a56-72bc-b3da840717ec\n\n, "encounter": {"reference": "urn:uuid:53bbaf66-d25d-377d-54ac-0612e8998cd5"}, "onsetDateTime": "1996-11-09T13:01:20+00:00", "abatementDateTime": "1997-05-10T13:00:40+00:00", "recordedDate": "1996-11-09T13:01:20+00:00"}\n---------------------\nGiven the context information and not prior knowledge, generate a JSON with keys and values of the relevant questions and answers, and do not add more information. Your response must be in this format: {"questions_and_answers": [{"question": "example question", "answer": "example answer"}]}'

In [42]:
data[101]["body"]["messages"][1]["content"]

'Context information is below.\n---------------------\nresourceType: Condition\nid: 8326ead1-ed63-67e1-debc-11a30e04bb89\n\n{"resourceType": "Condition", "id": "8326ead1-ed63-67e1-debc-11a30e04bb89", "meta": {"profile": ["http://hl7.org/fhir/us/core/StructureDefinition/us-core-condition-encounter-diagnosis"]}, "clinicalStatus": {"coding": [{"system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "resolved"}]}, "verificationStatus": {"coding": [{"system": "http://terminology.hl7.org/CodeSystem/condition-ver-status", "code": "confirmed"}]}\n---------------------\nGiven the context information and not prior knowledge, generate a JSON with keys and values of the relevant questions and answers, and do not add more information. Your response must be in this format: {"questions_and_answers": [{"question": "example question", "answer": "example answer"}]}'

In [44]:
result = liu.generate_qa_pairs(data[100]["body"]["messages"][1]["content"])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [46]:
result["openai_response"]

ChatCompletion(id='chatcmpl-9tMbhbKD0OethgghjWZNezgeFizVx', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{"questions_and_answers":[{"question":"What does the onset date of my condition signify?","answer":"The onset date of your condition, November 9, 1996, marks the beginning of your symptoms or diagnosis. This is important for understanding the duration of your condition and for tracking any changes in your health over time."},{"question":"What does the abatement date indicate about my condition?","answer":"The abatement date of your condition, May 10, 1997, indicates that the condition was considered resolved or significantly improved at that time. This suggests that any active symptoms or issues related to the condition were no longer present."},{"question":"What can I infer about the duration of my condition from the recorded dates?","answer":"The duration of your condition, as seen from the onset date of November 9, 1996, to 

In [47]:
json.loads(result["openai_response"].choices[0].message.content)

{'questions_and_answers': [{'question': 'What does the onset date of my condition signify?',
   'answer': 'The onset date of your condition, November 9, 1996, marks the beginning of your symptoms or diagnosis. This is important for understanding the duration of your condition and for tracking any changes in your health over time.'},
  {'question': 'What does the abatement date indicate about my condition?',
   'answer': 'The abatement date of your condition, May 10, 1997, indicates that the condition was considered resolved or significantly improved at that time. This suggests that any active symptoms or issues related to the condition were no longer present.'},
  {'question': 'What can I infer about the duration of my condition from the recorded dates?',
   'answer': 'The duration of your condition, as seen from the onset date of November 9, 1996, to the abatement date of May 10, 1997, lasted approximately six months. This duration is relevant for understanding the impact of the condi

In [68]:
result = liu.generate_qa_pairs(data[100]["body"]["messages"][1]["content"])

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [69]:
json.loads(result["openai_response"].choices[0].message.content)

{'questions_and_answers': [{'question': 'What do the onset and abatement dates signify for my condition?',
   'answer': 'The onset date, November 9, 1996, marks when your condition was first identified or presented, indicating the beginning of your symptoms or diagnosis. The abatement date, May 10, 1997, indicates when the condition was considered to have resolved or improved significantly. This timeline shows that your condition was active for approximately six months.'},
  {'question': 'How does the recorded date relate to my diagnosis?',
   'answer': 'The recorded date, which is the same as the onset date (November 9, 1996), indicates that this is when your condition was officially documented in your medical records. This is important for tracking your health history and any related treatments or evaluations you may have undergone.'},
  {'question': 'What should I know about the implications of the duration of my condition?',
   'answer': 'The duration of your condition, from its on

# 6. Test response time using different context lenght and model size

In [50]:
json.loads(result["openai_response"].choices[0].message.content)

{'questions_and_answers': [{'question': 'What does the onset and abatement date of my condition indicate about its duration?',
   'answer': 'The onset date of your condition was on November 9, 1996, and it was noted to have abated on May 10, 1997. This indicates that your condition was active for approximately six months and one day before it resolved.'},
  {'question': 'What does the recorded date tell me about my medical history?',
   'answer': "The recorded date of November 9, 1996, marks the time when your condition was first documented in your medical records. It serves as an important reference point for tracking the condition's progression and treatment over time."},
  {'question': 'What should I know about the significance of the abatement of my condition?',
   'answer': 'The abatement date indicates that your condition resolved or significantly improved as of May 10, 1997. Understanding this timeframe can help you and your healthcare provider assess any long-term effects or th

# 4. Generate jsonl file for OpenAI batch API

In [1]:
import json


with open("./data/selected_entries.json", "r") as f:
    json_data = json.load(f)

In [8]:
json.dumps(json_data["entry"][0]["resource"])

'{"resourceType": "Patient", "id": "3718b84e-cbe9-1950-6c6c-e6f4fdc907be", "meta": {"profile": ["http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient"]}, "text": {"status": "generated", "div": "<div xmlns=\\"http://www.w3.org/1999/xhtml\\">Generated by <a href=\\"https://github.com/synthetichealth/synthea\\">Synthea</a>.Version identifier: 27e32d4\\n .   Person seed: -8963394751129060304  Population seed: 1720471530063</div>"}, "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"system": "urn:oid:2.16.840.1.113883.6.238", "code": "2028-9", "display": "Asian"}}, {"url": "text", "valueString": "Asian"}]}, {"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [{"url": "ombCategory", "valueCoding": {"system": "urn:oid:2.16.840.1.113883.6.238", "code": "2186-5", "display": "Not Hispanic or Latino"}}, {"url": "text", "valueString": "Not Hispanic or Latino"}]}

In [43]:
nodes = create_json_nodes_llamaindex_batch_api(json_data)
len(nodes)

1000

In [76]:
generate_qa_file(nodes=nodes, output_file="./data/batch_api.jsonl")

In [14]:
import json

openai_responses = []
with open('./data/batch_3haP1i8Dsfohov5umgZanshX_output.jsonl', 'r') as f:
    for i, line in enumerate(f):
        openai_responses.append(json.loads(line))

In [16]:
for response in openai_responses:
    questions_and_answers = json.loads(
        openai_responses[1]["response"]["body"]["choices"][0]["message"]["content"]
    )["questions_and_answers"]
    if i % 1000 or i == 0:
        print(questions_and_answers)

[{'question': 'What does my ethnic background as Asian mean for my healthcare?', 'answer': 'Your identification as Asian can be relevant in healthcare as it may reflect certain genetic predispositions or risks for specific conditions prevalent in your ethnic group. For instance, some Asian populations may have a higher risk for conditions such as diabetes or hypertension. This knowledge can help healthcare providers tailor screenings, preventive measures, and treatment strategies that are culturally appropriate and based on statistical health trends within your ethnic group.'}, {'question': "What does being categorized as 'Not Hispanic or Latino' imply regarding my health care needs?", 'answer': "Being categorized as 'Not Hispanic or Latino' can help healthcare providers better understand your cultural background, which can influence health behaviors, healthcare preferences, and potential health risks. This classification is often used to ensure that healthcare services are sensitive a

In [8]:
openai_responses[0]

{'id': 'batch_req_RChTRwSmsDXye8n3jSXpmD8i',
 'custom_id': '1',
 'response': {'status_code': 200,
  'request_id': '7760ac1d71fe57412e0c2f8f07a7ca2a',
  'body': {'id': 'chatcmpl-9tcaWEm4kTi1FXF8HlwG9qOOYN3nP',
   'object': 'chat.completion',
   'created': 1723043400,
   'model': 'gpt-4o-mini-2024-07-18',
   'choices': [{'index': 0,
     'message': {'role': 'assistant',
      'content': '{"questions_and_answers": []}',
      'refusal': None},
     'logprobs': None,
     'finish_reason': 'stop'}],
   'usage': {'prompt_tokens': 659,
    'completion_tokens': 7,
    'total_tokens': 666},
   'system_fingerprint': 'fp_48196bc67a'}},
 'error': None}

In [11]:
questions_and_answers = json.loads(
    openai_responses[1]["response"]["body"]["choices"][0]["message"]["content"]
)["questions_and_answers"]

In [12]:
questions_and_answers

[{'question': 'What does my ethnic background as Asian mean for my healthcare?',
  'answer': 'Your identification as Asian can be relevant in healthcare as it may reflect certain genetic predispositions or risks for specific conditions prevalent in your ethnic group. For instance, some Asian populations may have a higher risk for conditions such as diabetes or hypertension. This knowledge can help healthcare providers tailor screenings, preventive measures, and treatment strategies that are culturally appropriate and based on statistical health trends within your ethnic group.'},
 {'question': "What does being categorized as 'Not Hispanic or Latino' imply regarding my health care needs?",
  'answer': "Being categorized as 'Not Hispanic or Latino' can help healthcare providers better understand your cultural background, which can influence health behaviors, healthcare preferences, and potential health risks. This classification is often used to ensure that healthcare services are sensit

In [13]:
for qa in questions_and_answers:
    question = qa["question"]
    print(question)

What does my ethnic background as Asian mean for my healthcare?
What does being categorized as 'Not Hispanic or Latino' imply regarding my health care needs?
How might my mother's maiden name, Toshia520 Yundt842, be relevant to my health records?


# 5. Costs

In [20]:
import json

In [21]:
len(json.dumps(answer))

1179

In [23]:
len(json.dumps(answer).replace("{", "").replace("}", ""))

1167

In [12]:
answer = """A chat between a curious user and an intelligent, polite medical assistant. The assistant provides detailed, helpful answers to the user's medical questions, including accurate references where applicable.<|eot_id|>
<|start_header_id|>user<|end_header_id|>

Context information is below.
{"resourceType": "Observation", "id": "099796cb-a72b-8dbf-2ce5-687ea75937b5", "meta": {"profile": ["http://hl7.org/fhir/us/core/StructureDefinition/us-core-observation-lab"]}, "status": "final", "category": [{"coding": [{"system": "http://terminology.hl7.org/CodeSystem/observation-category", "code": "laboratory", "display": "Laboratory"}]}], "code": {"coding": [{"system": "http://loinc.org", "code": "6690-2", "display": "Leukocytes [#/volume] in Blood by Automated count"}], "text": "Leukocytes [#/volume] in Blood by Automated count"}, "subject": {"reference": "urn:uuid:3718b84e-cbe9-1950-6c6c-e6f4fdc907be"}, "encounter": {"reference": "urn:uuid:0b660041-ff12-899e-2d1b-1ef35babff69"}, "effectiveDateTime": "1998-02-05T20:03:15+00:00", "issued": "1998-02-05T20:03:15.545+00:00", "valueQuantity": {"value": 6.8337, "unit": "10*3/uL", "system": "http://unitsofmeasure.org", "code": "10*3/uL"}}
{"resourceType": "Observation", "id": "ae729ca2-3c21-73f7-c285-a02d247f74a9", "meta": {"profile": ["http://hl7.org/fhir/us/core/StructureDefinition/us-core-observation-lab"]}, "status": "final", "category": [{"coding": [{"system": "http://terminology.hl7.org/CodeSystem/observation-category", "code": "laboratory", "display": "Laboratory"}]}], "code": {"coding": [{"system": "http://loinc.org", "code": "6690-2", "display": "Leukocytes [#/volume] in Blood by Automated count"}], "text": "Leukocytes [#/volume] in Blood by Automated count"}, "subject": {"reference": "urn:uuid:3718b84e-cbe9-1950-6c6c-e6f4fdc907be"}, "encounter": {"reference": "urn:uuid:41e6f1a1-e651-9416-a037-3ab269149136"}, "effectiveDateTime": "1998-01-21T16:57:00+00:00", "issued": "1998-01-21T16:57:00.545+00:00", "valueQuantity": {"value": 9.9836, "unit": "10*3/uL", "system": "http://unitsofmeasure.org", "code": "10*3/uL"}}
{"resourceType": "Observation", "id": "4b4bb748-338e-3df0-43dc-0692118ec4a1", "meta": {"profile": ["http://hl7.org/fhir/us/core/StructureDefinition/us-core-observation-lab"]}, "status": "final", "category": [{"coding": [{"system": "http://terminology.hl7.org/CodeSystem/observation-category", "code": "laboratory", "display": "Laboratory"}]}], "code": {"coding": [{"system": "http://loinc.org", "code": "777-3", "display": "Platelets [#/volume] in Blood by Automated count"}], "text": "Platelets [#/volume] in Blood by Automated count"}, "subject": {"reference": "urn:uuid:3718b84e-cbe9-1950-6c6c-e6f4fdc907be"}, "encounter": {"reference": "urn:uuid:41e6f1a1-e651-9416-a037-3ab269149136"}, "effectiveDateTime": "1998-01-20T16:22:18+00:00", "issued": "1998-01-20T16:22:18.545+00:00", "valueQuantity": {"value": 328.02, "unit": "10*3/uL", "system": "http://unitsofmeasure.org", "code": "10*3/uL"}}
---------------------
Given the context information (if there is any), this is my message: What is a Complete Blood Count (CBC) and why is it important?<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>"""

In [13]:
sd.get_total_tokens_from_string(json.dumps(answer))

1143

In [44]:
nodes[0]

TextNode(id_='e1f3ef42-e2cc-4459-a729-8337382713db', embedding=None, metadata={'resourceType': 'Patient', 'id': '3718b84e-cbe9-1950-6c6c-e6f4fdc907be'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='{"resourceType": "Patient", "id": "3718b84e-cbe9-1950-6c6c-e6f4fdc907be", "meta": {"profile": ["http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient"]}, "text": {"status": "generated", "div": "<div xmlns=\\"http://www.w3.org/1999/xhtml\\">Generated by <a href=\\"https://github.com/synthetichealth/synthea\\">Synthea</a>.Version identifier: 27e32d4\\n .   Person seed: -8963394751129060304  Population seed: 1720471530063</div>"}, "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [{"url": "ombCategory", "valueCoding": {"system": "urn:oid:2.16.840.1.113883.6.238", "code": "2028-9", "display": "Asian"}}, {"url": "text", "valueString": "Asian"}]}, {"url": "http://hl7.org/fhir/us/core/StructureDefi

In [57]:
costs_batch_api, total_input_tokens, total_output_tokens = aprox_costs(nodes)
print(costs_batch_api, total_input_tokens, total_output_tokens)

0.166 1009520 300000


In [58]:
costs_batch_api = aprox_costs(nodes, cost_per_million_input=0.15, cost_per_million_output=0.6)
print(costs_batch_api)

(0.331, 1009520, 300000)


In [2]:
from dotenv import load_dotenv
import os
import random

from transformers import AutoTokenizer


load_dotenv()

True

# TODO

- Cambiar notebooks a scripts
- Dejar lista consulta para con chatgpt
    - limitar generación y revisar textos largos, usar gpt4 mini
- Por qué es tan lento la respuesta con llama.cpp?
- Revisar porqué llama.cpp deja la pregunta incompleta si pasa el máximo contexto
- Terminar entregables con mejoras posteriores y next steps


In [8]:
sum([300]*10)

3000

# Test relations between nodes

In [None]:
def extract_references_recursive(resource):
    references = []

    if isinstance(resource, dict):
        if "reference" in resource:
            references.append(resource["reference"])
        for _, value in resource.items():
            references.extend(extract_references_recursive(value))
    elif isinstance(resource, list):
        for item in resource:
            references.extend(extract_references_recursive(item))

    return references

In [2]:
# Load the sample JSON data for a single patient
with open("./data/Abe604_Runolfsdottir785_3718b84e-cbe9-1950-6c6c-e6f4fdc907be.json", "r") as f:
    json_data = json.load(f)

In [9]:
llama_prompt = "{\"resourceType\": \"Observation\", \"id\": \"527d8281-b130-4459-2023-6fa431179861\", \"meta\": {\"profile\": [\"http://hl7.org/fhir/us/core/StructureDefinition/us-core-observation-lab\"]}, \"status\": \"final\", \"category\": [{\"coding\": [{\"system\": \"http://terminology.hl7.org/CodeSystem/observation-category\", \"code\": \"laboratory\", \"display\": \"Laboratory\"}]}], \"code\": {\"coding\": [{\"system\": \"http://loinc.org\", \"code\": \"33914-3\", \"display\": \"Glomerular filtration rate/1.73 sq M.predicted [Volume Rate/Area] in Serum or Plasma by Creatinine-based formula (MDRD)\"}], \"text\": \"Glomerular filtration rate/1.73 sq M.predicted [Volume Rate/Area] in Serum or Plasma by Creatinine-based formula (MDRD)\"}, \"subject\": {\"reference\": \"urn:uuid:3718b84e-cbe9-1950-6c6c-e6f4fdc907be\"}, \"encounter\": {\"reference\": \"urn:uuid:a92a56d4-6be1-c556-fc1c-a62ca7f43077\"}, \"effectiveDateTime\": \"1995-11-16T23:04:03+00:00\", \"issued\": \"1995-11-16T23:04:03.545+00:00\", \"valueQuantity\": {\"value\": 17.434, \"unit\": \"mL/min\", \"system\": \"http://unitsofmeasure.org\", \"code\": \"mL/min\"}}"
                                  

In [11]:
llama_prompt

'{"resourceType": "Observation", "id": "527d8281-b130-4459-2023-6fa431179861", "meta": {"profile": ["http://hl7.org/fhir/us/core/StructureDefinition/us-core-observation-lab"]}, "status": "final", "category": [{"coding": [{"system": "http://terminology.hl7.org/CodeSystem/observation-category", "code": "laboratory", "display": "Laboratory"}]}], "code": {"coding": [{"system": "http://loinc.org", "code": "33914-3", "display": "Glomerular filtration rate/1.73 sq M.predicted [Volume Rate/Area] in Serum or Plasma by Creatinine-based formula (MDRD)"}], "text": "Glomerular filtration rate/1.73 sq M.predicted [Volume Rate/Area] in Serum or Plasma by Creatinine-based formula (MDRD)"}, "subject": {"reference": "urn:uuid:3718b84e-cbe9-1950-6c6c-e6f4fdc907be"}, "encounter": {"reference": "urn:uuid:a92a56d4-6be1-c556-fc1c-a62ca7f43077"}, "effectiveDateTime": "1995-11-16T23:04:03+00:00", "issued": "1995-11-16T23:04:03.545+00:00", "valueQuantity": {"value": 17.434, "unit": "mL/min", "system": "http://u

In [10]:
llama_prompt.replace("\\", "")

'{"resourceType": "Observation", "id": "527d8281-b130-4459-2023-6fa431179861", "meta": {"profile": ["http://hl7.org/fhir/us/core/StructureDefinition/us-core-observation-lab"]}, "status": "final", "category": [{"coding": [{"system": "http://terminology.hl7.org/CodeSystem/observation-category", "code": "laboratory", "display": "Laboratory"}]}], "code": {"coding": [{"system": "http://loinc.org", "code": "33914-3", "display": "Glomerular filtration rate/1.73 sq M.predicted [Volume Rate/Area] in Serum or Plasma by Creatinine-based formula (MDRD)"}], "text": "Glomerular filtration rate/1.73 sq M.predicted [Volume Rate/Area] in Serum or Plasma by Creatinine-based formula (MDRD)"}, "subject": {"reference": "urn:uuid:3718b84e-cbe9-1950-6c6c-e6f4fdc907be"}, "encounter": {"reference": "urn:uuid:a92a56d4-6be1-c556-fc1c-a62ca7f43077"}, "effectiveDateTime": "1995-11-16T23:04:03+00:00", "issued": "1995-11-16T23:04:03.545+00:00", "valueQuantity": {"value": 17.434, "unit": "mL/min", "system": "http://u

In [18]:
a = ", value: 3718b84e-cbe9-1950-6c6c-e6f4fdc907be}, {type: {coding: [{system: http://terminology.hl7.org/CodeSystem/v2-0203, code: MR, display: Medical Record Number}], text: Medical Record Number}, system: http://hospital.smarthealthit.org, value: 3718b84e-cbe9-1950-6c6c-e6f4fdc907be}, {type: {coding: [{system: http://terminology.hl7.org/CodeSystem/v2-0203, code: SS, display: Social Security Number}], text: Social Security Number}, system: http://hl7.org/fhir/sid/us-ssn, value: 999-31-4351}"

In [19]:
len(a)

492

In [13]:
a

', "location": [{"location": {"reference": "Location?identifier=https://github.com/synthetichealth/synthea|3f92fe75-f098-31ff-94e6-884465ffa64a", "display": "Worcester Outpatient Clinic"}}], "serviceProvider": {"reference": "Organization?identifier=https://github.com/synthetichealth/synthea|1e3ca321-fbbb-3546-b9ea-c3761a20324a", "display": "Worcester Outpatient Clinic"}}'

In [10]:
cleaned_a = a.replace('\"', '').replace('\\', '')

In [11]:
len(cleaned_a)

350

In [12]:
cleaned_a

', location: [{location: {reference: Location?identifier=https://github.com/synthetichealth/synthea|3f92fe75-f098-31ff-94e6-884465ffa64a, display: Worcester Outpatient Clinic}}], serviceProvider: {reference: Organization?identifier=https://github.com/synthetichealth/synthea|1e3ca321-fbbb-3546-b9ea-c3761a20324a, display: Worcester Outpatient Clinic}}'