<a href="https://colab.research.google.com/gist/johnnygreco/ed5fc5fa46a46887adb1ade232da359b/physician-notes-with-non-llm-data-sources.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
%%capture
# Install the latest Gretel client directly from its GitHub repository, together
# with the `datasets` package (needed for `load_dataset` below). The %%capture
# cell magic on the first line keeps the installer output out of the notebook.
%pip install -U git+https://github.com/gretelai/gretel-python-client datasets

In [None]:
from datasets import load_dataset

from gretel_client.navigator_client import Gretel

# Client handle used by the rest of the notebook to build the Data Designer workflow.
# NOTE(review): the endpoint targets the dev API host -- confirm that is intended
# for a notebook that is shared publicly.
gretel = Gretel(
    api_key="prompt",
    endpoint="https://api.dev.gretel.ai",
)

## 🏥 Patient Notes Example

In [None]:
# Seed the workflow with Gretel's symptom-to-diagnosis dataset: take the "train"
# split, convert it to pandas, and give the two text columns descriptive names.
df_seed = (
    load_dataset("gretelai/symptom_to_diagnosis")["train"]
    .to_pandas()
    .rename(
        columns={
            "input_text": "patient_summary",
            "output_text": "diagnosis",
        }
    )
)

print(f"Number of records: {len(df_seed)}")

# Peek at the first few seed rows.
df_seed.head()

In [None]:
# Start a new Data Designer configuration that uses the "apache-2.0" model suite.
aidd = gretel.data_designer.new(model_suite="apache-2.0")

# Seed rows are shuffled and sampled without replacement, so (per the original
# note) the maximum num_records for this workflow is 853.
aidd.with_seed_dataset(df_seed, sampling_strategy="shuffle", with_replacement=False)

# Register two random person samplers with default settings: one for patients,
# one for doctors. For now the default locale has been updated to "en_GB",
# because the PGM is not yet supported in streaming mode.
aidd.with_person_samplers(
    {
        "patient_sampler": {},
        "doctor_sampler": {},
    }
)

In [None]:
# Prompt template for the LLM-generated notes column. Besides the columns added
# in this cell, it also references seed-data fields (diagnosis, patient_summary).
physician_notes_prompt = """\
<context>
You are a primary-care physician who just had an appointment with {{first_name}} {{last_name}},
who has been struggling with symptoms from {{diagnosis}} since {{symptom_onset_date}}.
The date of today's visit is {{date_of_visit}}.
</context>

<patient_summary_of_symptoms>
{{patient_summary}}
</patient_summary_of_symptoms>

<task>
Write careful notes about your visit with {{first_name}},
as {{physician}}.

Format the notes as a busy doctor might.
</task>
"""

# UUID-based patient identifier configured with a "PT-" prefix, short form, upper case.
aidd.add_column(
    name="patient_id",
    type="uuid",
    params={"prefix": "PT-", "short_form": True, "uppercase": True},
)

# Patient attributes taken directly from the patient sampler, added in this order.
patient_expressions = {
    "first_name": "{{patient_sampler.first_name}}",
    "last_name": "{{patient_sampler.last_name}}",
    "dob": "{{patient_sampler.birth_date}}",
    "patient_email": "{{patient_sampler.email_address}}",
}
for column_name, template in patient_expressions.items():
    aidd.add_column(name=column_name, type="expression", expr=template)

# Symptom onset is sampled from calendar year 2024.
aidd.add_column(
    name="symptom_onset_date",
    type="datetime",
    params={"start": "2024-01-01", "end": "2024-12-31"},
)

# The visit date is an offset of 1 to 30 from the onset date
# (units are presumably days -- confirm against the sampler documentation).
aidd.add_column(
    name="date_of_visit",
    type="timedelta",
    params={
        "reference_column_name": "symptom_onset_date",
        "dt_min": 1,
        "dt_max": 30,
    },
)

# The physician's display name is assembled from the doctor sampler.
aidd.add_column(
    name="physician",
    type="expression",
    expr="Dr. {{doctor_sampler.first_name}} {{doctor_sampler.last_name}}",
)

# LLM-generated free-text notes, driven by the prompt template defined above.
aidd.add_column(
    name="physician_notes",
    type="llm-gen",
    prompt=physician_notes_prompt,
)

In [None]:
# Run a preview of the configured workflow with verbose logging turned on, and
# keep the result for inspection in the following cells.
preview = aidd.preview(verbose_logging=True)

In [None]:
# Display a sample record from the preview to check the generated columns.
preview.display_sample_record()

In [None]:
# Show the full preview dataset. Note that it includes the seed data fields
# as columns.
preview.dataset.df

In [None]:
# Intentionally left disabled: uncomment to submit a run of 100 records named
# "physician_notes" once the preview looks good.
# NOTE(review): wait_for_completion=True presumably blocks until the run
# finishes -- confirm against the client documentation.
# workflow_run = aidd.create(
#     num_records=100,
#     workflow_run_name="physician_notes",
#     wait_for_completion=True
# )