<a href="https://colab.research.google.com/github/ekrombouts/GenCareAI/blob/olympia/notebooks/100_note_generation/120_GenerateClientScenarios.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GenCare AI: Generating client scenarios

**Author:** Eva Rombouts  
**Date:**   2024-06-13  
**Updated:** 2024-10-07  
**Version:** 2.0

### Description
In previous notebooks, we created a dataset with synthetic progress notes and client profiles for nursing home residents. In this notebook, we will generate client scenarios that describe the course of events during their stay in a psychogeriatric ward. These scenarios aim to provide a timeline of care over several weeks, including complications that may arise during the client’s time in the nursing home.

Our goal is to simulate the subtle changes that occur over time in a resident’s health and care needs. Each scenario is generated based on a client profile and includes complications such as weight loss, infections, or other health-related issues. The number of weeks and complications vary to reflect the unpredictability of real-life care trajectories.

In this notebook, we use the gpt-4o-mini model to generate these scenarios. The temperature is set to 1.1 to promote variation in the generated content. The ward name is defined to allow for multiple experiments, and the number of weeks is drawn from a normal distribution to ensure variability in the duration of each client’s scenario.

This script generates client scenarios based on profiles generated [here](todo).

Generating scenarios for 24 client profiles, with an average duration of 20 weeks each, costs approximately $?? per run.

In [None]:
# Setup dependencies based on environment (e.g., Colab)
# The `!` lines are IPython shell escapes, so this cell only runs inside a notebook.
# GenCareAI is installed first so that the import on the following line can resolve.
!pip install GenCareAI
from GenCareAI.GenCareAIUtils import GenCareAISetup

# `setup` is used later in this notebook to resolve data file paths and to fetch the OpenAI key.
setup = GenCareAISetup()

# Only when running on Colab: quietly (-q) install the LangChain packages imported below.
if setup.environment == 'Colab':
        !pip install -q langchain langchain_core langchain_openai langchain_community

In [2]:
from typing import List

from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.callbacks import get_openai_callback
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from GenCareAI import ClientProfileFormatter

import os
import pandas as pd
import random
import numpy as np
from tqdm import tqdm

In [3]:
# Settings for scenario generation
ward_name = 'Hermes'  # ward label; part of both file names below

# CSV locations: the profiles file is read, the scenarios file is written (or reloaded)
fn_profiles = setup.get_file_path(f'data/gcai_client_profiles_{ward_name}.csv')
fn_scenarios = setup.get_file_path(f'data/gcai_client_scenarios_{ward_name}.csv')

# LLM name and sampling temperature
model_name, temp = 'gpt-4o-mini-2024-07-18', 1.1

# Scenario length in weeks: mean and standard deviation of the normal draw
duration, duration_sd = 20, 6

# Inclusive bounds for the number of complications per client
num_complications_min, num_complications_max = 1, 3

# Pool of complications (Dutch) from which each client gets a random selection
complications_library = [
    "gewichtsverlies",
    "algehele achteruitgang",
    "decubitus",
    "urineweginfectie",
    "pneumonie",
    "delier",
    "verergering van onderliggende lichamelijke klachten",
    "verbetering van de klachten",
    "overlijden",
    "valpartij",
]

# Set to True to print the rendered prompt and a sample client further down
verbose = False


In [4]:
# Load the client profiles from the CSV at `fn_profiles`.
# Each row of `df` is later formatted into a prompt and turned into one scenario.
df = pd.read_csv(fn_profiles)

In [5]:
# Define the Pydantic models for handling scenario outputs
# ClientScenario is a single timeline entry of a client's scenario.
# NOTE(review): the Dutch Field descriptions are presumably surfaced to the model through
# the parser's format instructions, so treat them as prompt text — confirm before editing.
class ClientScenario(BaseModel):
    # Week identifier, kept as a string ("Weeknummer" = week number)
    week: str = Field(description="Weeknummer")
    # Free-text description of the events and care ("Beschrijving van de gebeurtenissen en zorg")
    events_description: str = Field(description="Beschrijving van de gebeurtenissen en zorg")

# Top-level object the output parser is configured to produce: the full timeline for one client.
class ClientScenarios(BaseModel):
    # Timeline entries; the generation loop stores each entry as one output row
    scenario: List[ClientScenario]

In [6]:
# Output parser: converts the model reply into a ClientScenarios object and
# supplies the format instructions that are embedded in the prompt
pyd_parser = PydanticOutputParser(pydantic_object=ClientScenarios)
format_instructions = pyd_parser.get_format_instructions()

# Chat model configured with the notebook-level model name and temperature
model = ChatOpenAI(
    model=model_name,
    temperature=temp,
    api_key=setup.get_openai_key(),
)

In [None]:
# Prompt text (Dutch): asks for a timeline of one client's stay, covering a given
# number of weeks and weaving in the given complication(s)
template = """
Dit is het profiel van een fictieve client in het verpleeghuis:
---
{client_profile}
---

Schrijf in een tijdlijn het beloop van zijn/haar verblijf in het verpleeghuis gedurende {num_weeks} weken.
Verwerk de volgende complicatie(s) hierin: {complications}.

Instructies:
- Zorg dat een individuele scenarioregel wordt begrepen door een taalmodel. Het scenario zal later worden gebruikt voor het genereren van fictieve rapportages.
- Hou wijzigingen subtiel. Vermijd al te grote dramatiek. 

{format_instructions}
"""

# The format instructions are bound once here; the other three variables are filled per client
prompt_template = PromptTemplate(
    input_variables=["client_profile", "num_weeks", "complications"],
    partial_variables={"format_instructions": format_instructions},
    template=template,
)

# Optional preview of the rendered prompt, using placeholder values
if verbose:
    print(
        prompt_template.format(
            client_profile="client profiel",
            num_weeks=21,
            complications="complicatie(s)",
        )
    )


In [8]:
# Create a chain of operations: prompt template -> model -> output parser.
# Invoking the chain with client_profile, num_weeks and complications fills the prompt,
# sends it to the chat model and passes the reply through the ClientScenarios parser.
chain_scenario = prompt_template | model | pyd_parser

In [None]:
# Generate and save scenarios. Generation is skipped when the scenario file already
# exists, so delete that file to force a fresh (paid) run.
if not os.path.exists(fn_scenarios):
    print("Data file not found. Generating new data...")

    def generate_scenarios(df, chain, max_attempts=3):
        """Generate a week-by-week scenario for every client profile in ``df``.

        Args:
            df: DataFrame of client profiles. Each row is passed to the profile
                formatter and must provide a ``client_id`` value.
            chain: Object whose ``invoke`` accepts a dict with the keys
                ``client_profile``, ``num_weeks`` and ``complications`` and returns
                a result with a ``scenario`` list of entries exposing ``week`` and
                ``events_description``.
            max_attempts: Total number of invocations tried per client before the
                last error is re-raised. Values below 1 are treated as 1.

        Returns:
            A list of tuples ``(client_id, week, events_description, complications,
            num_weeks)``, one per scenario entry.

        Raises:
            Exception: Whatever ``chain.invoke`` raised on the final attempt.
        """
        cpf = ClientProfileFormatter()
        max_attempts = max(1, max_attempts)

        def determine_duration(mean=6, std_dev=2):
            """Draw the scenario length in weeks from a normal distribution."""
            # A normal draw can round to zero or be negative; a scenario needs at least one week.
            return max(1, int(np.round(np.random.normal(mean, std_dev))))

        def determine_num_complications(lower=1, upper=3):
            """Pick how many complications to include (both bounds inclusive)."""
            return random.randint(lower, upper)

        scenario_list = []
        for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Generating Scenario's"):
            # Format the client profile
            client_profile = cpf.format_client_profile(
                profile_row=row,
                display_name=False
            )

            # Determine the number of weeks and complications for the scenario
            num_weeks = determine_duration(mean=duration, std_dev=duration_sd)
            num_complications = determine_num_complications(
                lower=num_complications_min,
                upper=num_complications_max,
            )
            # random.sample raises when asked for more items than the library holds
            num_complications = min(num_complications, len(complications_library))
            chosen_complications = random.sample(complications_library, num_complications)
            complications = ", ".join(chosen_complications)

            chain_input = {
                "client_profile": client_profile,
                "num_weeks": str(num_weeks),
                "complications": complications,
            }

            # Invoke the model.
            # Errors are frequently due to incorrectly formatted responses, causing parsing
            # errors, so a plain retry often succeeds. Give up after max_attempts tries.
            for attempt in range(1, max_attempts + 1):
                try:
                    result = chain.invoke(chain_input)
                except Exception as e:
                    if attempt == max_attempts:
                        raise
                    print(f"Error encountered: {e}. Retrying...")
                else:
                    if attempt > 1:
                        print("Retry successful")
                    break

            # Store one row per scenario entry
            scenario_list.extend(
                (row['client_id'], entry.week, entry.events_description, complications, num_weeks)
                for entry in result.scenario
            )
        return scenario_list

    # Run the generation inside the OpenAI callback and print its summary afterwards
    with get_openai_callback() as cb:
        scenario_data = generate_scenarios(df, chain_scenario)
        print(cb)

    df_scenarios = pd.DataFrame(
        scenario_data,
        columns=['client_id', 'week', 'events_description', 'complications', 'num_weeks'],
    )
    df_scenarios.to_csv(fn_scenarios, index=False)
    print(f"Data saved successfully to {fn_scenarios}.")
else:
    print("Data file found. Loading data...")
    df_scenarios = pd.read_csv(fn_scenarios)

In [None]:
# Optional check: print one client's profile, a separator, and that client's scenario lines
if verbose:
    sample_client = 3
    cpf = ClientProfileFormatter()

    # Profile of the sample client (first matching row)
    print(
        cpf.format_client_profile(
            profile_row=df.loc[df['client_id'] == sample_client].iloc[0]
        )
    )

    print('-' * 100)

    # Scenario entries of the same client, numbered from 1 in stored order
    ct_scens = df_scenarios.loc[
        df_scenarios['client_id'] == sample_client,
        ['week', 'events_description'],
    ]
    for i, description in enumerate(ct_scens['events_description'], start=1):
        print(f"{i}. {description}")