<a href="https://colab.research.google.com/gist/johnnygreco/ed5fc5fa46a46887adb1ade232da359b/physician-notes-with-non-llm-data-sources.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
%%capture
# Install the Gretel client straight from its GitHub repository, together with
# the `networkx` and `datasets` packages. `-U` upgrades any copies that are
# already installed, so this cell always pulls the latest (unpinned) versions.
# The `%%capture` cell magic on the first line silences the installer output.
%pip install -U git+https://github.com/gretelai/gretel-python-client networkx datasets

In [7]:
from datasets import load_dataset

from gretel_client.navigator_client import Gretel

# Build the Gretel client used by every later cell.
# NOTE(review): the endpoint below is the *dev* API host -- confirm this is the
# intended target before sharing the notebook outside the team.
gretel = Gretel(
    api_key="prompt",
    endpoint="https://api.dev.gretel.ai",
)

Found cached Gretel credentials
Logged in as kirit.thadaka@gretel.ai ✅
Gretel client configured to use project: proj_2uY0cfM0kjiegpyEZvCHNKZYxGf


## 🏥 Patient Notes Example

In [8]:
# Seed the workflow with the "train" split of Gretel's symptom-to-diagnosis
# dataset, renaming its two text columns to names that read naturally in the
# prompt templates used later on.
df_seed = (
    load_dataset("gretelai/symptom_to_diagnosis")["train"]
    .to_pandas()
    .rename(
        columns={
            "output_text": "diagnosis",
            "input_text": "patient_summary",
        }
    )
)

print(f"Number of records: {len(df_seed)}")

# Peek at the first few seed rows.
df_seed.head()

Number of records: 853


Unnamed: 0,diagnosis,patient_summary
0,cervical spondylosis,I've been having a lot of pain in my neck and ...
1,impetigo,I have a rash on my face that is getting worse...
2,urinary tract infection,I have been urinating blood. I sometimes feel ...
3,arthritis,I have been having trouble with my muscles and...
4,dengue,I have been feeling really sick. My body hurts...


In [9]:
# Start a new Data Designer configuration using model_suite="apache-2.0".
aidd = gretel.data_designer.new(model_suite="apache-2.0")

# Seed rows are shuffled and drawn with with_replacement=False, so the
# maximum num_records equals the size of the seed dataset (853 records).
aidd.with_seed_dataset(df_seed, sampling_strategy="shuffle", with_replacement=False)

# Register two random person samplers (one for patients, one for doctors),
# each with an empty parameter dict. For now, the default locale has been
# updated to "en_GB", since the PGM is not yet supported in streaming mode.
aidd.with_person_samplers(
    {sampler_name: {} for sampler_name in ("patient_sampler", "doctor_sampler")}
)

[19:07:18] [INFO] 🌱 Using seed dataset with file ID: file_7dc363b601dd4e52ad04d033a19418f4


In [13]:
# Patient identifier: a uuid column configured with a "PT-" prefix and the
# short_form / uppercase options turned on.
aidd.add_column(
    name="patient_id",
    type="uuid",
    params=dict(prefix="PT-", short_form=True, uppercase=True),
)

# Patient demographics are all plain expressions over `patient_sampler`;
# they are registered in the same order as before.
for column_name, expression in (
    ("first_name", "{{patient_sampler.first_name}}"),
    ("last_name", "{{patient_sampler.last_name}}"),
    ("dob", "{{patient_sampler.birth_date}}"),
    ("patient_email", "{{patient_sampler.email_address}}"),
):
    aidd.add_column(name=column_name, type="expression", expr=expression)

# Symptom onset is sampled between the given start and end dates.
aidd.add_column(
    name="symptom_onset_date",
    type="datetime",
    params=dict(start="2024-01-01", end="2024-12-31"),
)

# The visit date is an offset of 1 to 30 from `symptom_onset_date`
# (presumably days -- the unit is not set here, so the sampler default applies).
aidd.add_column(
    name="date_of_visit",
    type="timedelta",
    params=dict(
        dt_min=1,
        dt_max=30,
        reference_column_name="symptom_onset_date",
    ),
)

# The physician's display name is assembled from the `doctor_sampler` fields.
aidd.add_column(
    name="physician",
    type="expression",
    expr="Dr. {{doctor_sampler.first_name}} {{doctor_sampler.last_name}}",
)

# LLM-generated notes. The prompt template can reference the seed data fields
# (`diagnosis`, `patient_summary`) as well as the columns defined above.
aidd.add_column(
    name="physician_notes",
    type="llm-gen",
    prompt="""\
<context>
You are a primary-care physician who just had an appointment with {{first_name}} {{last_name}},
who has been struggling with symptoms from {{diagnosis}} since {{symptom_onset_date}}.
The date of today's visit is {{date_of_visit}}.
</context>

<patient_summary_of_symptoms>
{{patient_summary}}
</patient_summary_of_symptoms>

<task>
Write careful notes about your visit with {{first_name}},
as {{physician}}.

Format the notes as a busy doctor might.
</task>
""",
)

In [14]:
# Run a preview of the configured columns with verbose logging enabled and keep
# the result in `preview` for the inspection cells that follow.
preview = aidd.preview(verbose_logging=True)

[19:08:34] [INFO] 🚀 Generating preview
[19:08:35] [INFO] ⛓️ Representing generation steps as a Directed Acyclic Graph
[19:08:35] [INFO]   |-- 🔗 `physician_notes` depends on `last_name`
[19:08:35] [INFO]   |-- 🔗 `physician_notes` depends on `first_name`
[19:08:35] [INFO]   |-- 🔗 `physician_notes` depends on `physician`
[19:08:37] [INFO] 🎲 Step 1: Sample from dataset
[19:08:38] [INFO]   |-- 🎲 Sampling 10 records from input dataset
[19:08:39] [INFO] 🦜 Step 2: Generate columns using samplers
[19:08:39] [INFO]   |-- 🎲 🧑‍🚀 Creating person generator
[19:08:39] [INFO]   |-- 🎲 Using numerical samplers to generate 10 records across 5 columns
[19:08:44] [INFO] 🔗 Step 3: Concat datasets
[19:08:44] [INFO]   |-- (💾 + 💾) Concatenating 2 datasets
[19:08:44] [INFO] 🦜 Step 4: Generate column from expression
[19:08:44] [INFO]   |-- 🧩 Generating column `last_name` from expression
[19:08:45] [INFO] 🦜 Step 5: Generate column from expression 1
[19:08:45] [INFO]   |-- 🧩 Generating column `first_name` from exp

In [None]:
# Display a sample record from the preview results.
preview.display_sample_record()

In [None]:
# Inspect the whole preview dataset; it includes the seed data as columns
# alongside the columns added in this notebook.
preview.dataset.df

In [None]:
# Intentionally left commented out so that "Run All" only produces a preview.
# Uncomment to launch a full generation run: it requests 100 records under the
# run name "physician_notes" and, with wait_for_completion=True, presumably
# blocks until the run has finished (verify against the client docs).
# workflow_run = aidd.create(
#     num_records=100,
#     workflow_run_name="physician_notes",
#     wait_for_completion=True
# )