<a href="https://colab.research.google.com/github/ekrombouts/GenCareAI/blob/main/notebooks/100_note_generation/110_GenerateClientProfiles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GenCare AI: Generating client profiles

**Author:** Eva Rombouts  
**Date:** 2024-06-01  
**Updated:** 2024-09-01  
**Version:** 1.4

### Description
This script generates synthetic healthcare data for NLP experiments. It generates diverse client profiles for a psychogeriatric ward using the OpenAI GPT-4o model.

The output parser uses a structure called ClientProfiles, which holds a list of ClientProfile objects; both are created with Pydantic models. Pydantic helps define and validate the output for each client profile, ensuring that each profile has the right format and contains the necessary information.

The goal is to produce a comprehensive and varied dataset of client profiles for use in a psychogeriatric setting, avoiding repetitive or deterministic outputs. To achieve this, we use GPT-4o with a high temperature setting (1.1) to enhance variability. Additionally, each query generates multiple profiles to further ensure diversity.

With the current settings of generating eight profiles per query and running the query three times, the cost is approximately $0.05 per run.

In [None]:
# Install the GenCareAI helper package (IPython shell magic) and import its setup class
!pip install GenCareAI
from GenCareAI.GenCareAIUtils import GenCareAISetup

# Setup object used further down for file paths and the OpenAI API key
setup = GenCareAISetup()

# Only when the setup object reports a Colab environment are the LangChain
# packages installed here
if setup.environment == 'Colab':
        !pip install -q langchain langchain_core langchain_openai langchain_community

In [2]:
import os
import pandas as pd
from typing import List
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_community.callbacks import get_openai_callback

In [5]:
# Experiment configuration
# WARD_NAME is embedded in the output filename, which keeps the results of
# separate experiments apart.
WARD_NAME = "Hermes"
FN_PROFILES = setup.get_file_path(f"data/gcai_client_profiles_{WARD_NAME}.csv")
# Number of times the query is run. Each query asks for eight profiles, so
# NUM_WINGS = 3 yields 24 client profiles in total.
NUM_WINGS = 3
# Model used for profile generation; GPT-4o yields better, more diverse
# results than gpt-3.5.
MODEL_PROFILES = "gpt-4o-2024-05-13"
# Sampling temperature handed to the chat model
TEMP = 1.1

In [6]:
# Definition of Pydantic model to structure the client profile data.
# Every field is a required string; the Dutch description attached to each field
# tells the model what to write there (name, dementia type, physical complaints,
# ADL support, mobility, and cognition/behaviour).
# NOTE(review): the descriptions are runtime text, presumably surfaced to the LLM
# through the parser's format instructions - edit them as prompt text, not as comments.
class ClientProfile(BaseModel):
    naam: str = Field(description="naam van de client (Meneer/Mevrouw Voornaam Achternaam, gebruik een naam die je normaal niet zou kiezen)")
    type_dementie: str = Field(description="type dementie (Alzheimer, gemengde dementie, vasculaire dementie, lewy body dementie, parkinsondementie, FTD: varieer, de kans op Alzheimer, gemengde en vasculaire dementie is het grootst)")
    somatiek: str = Field(description="lichamelijke klachten")
    # Disabled field (biography); kept for reference and currently not part of the schema:
    # biografie: str = Field(description="een korte beschrijving van karakter en relevante biografische gegevens (vermijd stereotypen in beroep en achtergrond)")
    adl: str = Field(description="beschrijf welke ADL hulp de cliënt nodig heeft")
    mobiliteit: str = Field(description="beschrijf de mobiliteit (bv rolstoelafhankelijk, gebruik rollator, valgevaar)")
    gedrag: str = Field(description="beschrijf voor de zorg relevante aspecten van cognitie en probleemgedrag. Varieer met de ernst van het probleemgedrag van rustige cliënten, gemiddeld onrustige cliënten tot cliënten die fors apathisch, onrustig, angstig, geagiteerd of zelfs agressief kunnen zijn")

# Pydantic model to hold multiple client profiles.
# This wrapper is the object the output parser is built from; its single field
# `clients` is the list of ClientProfile entries returned by one query.
class ClientProfiles(BaseModel):
    clients: List[ClientProfile]

In [7]:
# Set up the output parser (targeting ClientProfiles) and the OpenAI chat model
pyd_parser = PydanticOutputParser(pydantic_object=ClientProfiles)
model = ChatOpenAI(
    api_key=setup.get_openai_key(),
    temperature=TEMP,
    model=MODEL_PROFILES,
)

In [None]:
# Prompt asking for eight client profiles of a psychogeriatric ward. The parser's
# format instructions are supplied up front as a partial variable, so the
# template itself declares no input variables.
PT_client_profiles = PromptTemplate(
    template = """
Schrijf acht profielen van cliënten die zijn opgenomen op een psychogeriatrische afdeling van het verpleeghuis. Hier wonen mensen met een gevorderde dementie met een hoge zorgzwaarte.
Zorg dat de profielen erg van elkaar verschillen.

{format_instructions}
""",
    input_variables=[],
    partial_variables={"format_instructions": pyd_parser.get_format_instructions()},
)

# Render the full prompt for inspection. The template has no input variables,
# so format() is called without arguments.
P_client_profiles = PT_client_profiles.format()
print(P_client_profiles)

In [9]:
# Pipeline: the rendered prompt goes to the model, and the model's reply is
# handed to the Pydantic parser
chain_client_profiles = (
    PT_client_profiles
    | model
    | pyd_parser
)

In [None]:
# Load the cached profiles when the CSV already exists; otherwise query the
# model, number the clients and write the CSV
if os.path.exists(FN_PROFILES):
    print("Data file found. Loading data...")
    df_with_id = pd.read_csv(FN_PROFILES)
else:
    print("Data file not found. Generating new data...")

    os.makedirs(os.path.dirname(FN_PROFILES), exist_ok=True)

    def generate_data():
        """Run the profile chain NUM_WINGS times and return all profiles as a DataFrame.

        Raises:
            ValueError: if a chain call returns None or an object without `clients`.
        """
        records = []
        for wing_number in range(1, NUM_WINGS + 1):
            print(f'Generating data for wing {wing_number}')
            response = chain_client_profiles.invoke({})
            if not (response is not None and hasattr(response, 'clients')):
                raise ValueError("No valid response received from the model.")
            # One dict per client, appended in the order the model returned them
            records += [profile.dict() for profile in response.clients]
        return pd.DataFrame(records)

    def add_client_id(df):
        """Add a 1-based `client_id` column to `df` and return the columns in fixed order.

        The `client_id` column is written onto the passed DataFrame itself.
        """
        ordered_columns = [
            'client_id',
            'naam',
            'type_dementie',
            'somatiek',
            'adl',
            'mobiliteit',
            'gedrag',
        ]
        df['client_id'] = range(1, len(df) + 1)
        return df[ordered_columns]

    # The callback object is printed after generation to report on the OpenAI calls
    with get_openai_callback() as cb:
        df = generate_data()
        print("Data generated successfully.\n")
        print(cb)

    df_with_id = add_client_id(df)
    df_with_id.to_csv(FN_PROFILES, index=False)
    print(f"Data saved successfully to {FN_PROFILES}.")