<a href="https://colab.research.google.com/github/ekrombouts/gcai_zuster_fietje/blob/main/notebooks/300_GenCareAICarePlanGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Creating a Careplan Dataset from GenCareAI Client records

**Author:** Eva Rombouts  
**Date:** 2024-09-16  
**Updated:** 2024-11-30

### Description
This notebook summarizes nursing home client notes into the Careplan format using OpenAI’s GPT model via LangChain. It preprocesses the notes, generates a careplan for each client-week, splits the resulting dataset into training, validation, and test sets, and uploads it to the Hugging Face Hub for use in machine learning models.

## Environment Setup and Library Imports

In [None]:
# # When in Colab
# from google.colab import drive, userdata
# import os

# drive.mount('/content/drive')
# base_dir = "/content/drive/My Drive/Colab Notebooks/GenCareAI"
# open_ai_api_key = userdata.get("GCI_OPENAI_API_KEY")

# !pip install -q datasets langchain langchain_community langchain_openai

In [None]:
# Local (non-Colab) environment
import os
from pathlib import Path

# Parent directory of the current working directory
base_dir = Path.cwd().resolve().parents[0]

# API key is read from the environment; this is None when the variable is unset
open_ai_api_key = os.environ.get("GCI_OPENAI_API_KEY")

In [None]:
import os
import random
import pandas as pd
from tqdm import tqdm  # Progress bar
from datasets import load_dataset, Dataset, DatasetDict  # For loading and managing datasets
from typing import List

# Langchain modules
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.callbacks import get_openai_callback

# Pydantic library for data validation
from pydantic import BaseModel, Field

# OpenAI API integration using langchain
from langchain_openai import ChatOpenAI

# Scikit-learn library for splitting datasets into training and testing sets
from sklearn.model_selection import train_test_split

# Notebook-wide switch: later cells print data samples and run a single demo LLM call only when this is True
verbose = True

In [None]:
# ---- Notebook parameters ----

# Seed passed to the train/validation/test splits
seed = 6

# Nursing home name; every dataset name below is derived from it
nursing_care_home_name = "Gardenia"

# Hugging Face repositories that are read
path_hf_records = f"ekrombouts/{nursing_care_home_name}_records"
path_hf_clients = f"ekrombouts/{nursing_care_home_name}_clients"

# Hugging Face repository that is written, and the commit message used for the upload
path_hf_careplan = f"ekrombouts/{nursing_care_home_name}_Careplan_dataset"
commit_message = (
    "Careplan dataset. Created: "
    "https://colab.research.google.com/github/ekrombouts/gcai_zuster_fietje/blob/main/notebooks/300_GenCareAICarePlanGeneration.ipynb"
)

# Local pickle file in which the generated Careplans are stored
fn_responses = os.path.join(
    base_dir,
    f"data/care_pal/{nursing_care_home_name}_Careplan_dataset.pkl",
)

# LLM settings for Careplan generation
model = "gpt-4o-mini-2024-07-18"
temperature = 0.3

# Visual separator used in printed output
sep_line = "-" * 50


## Loading and Preprocessing Data
Records containing the notes of fictional nursing home clients are loaded and cleaned. The notes are then grouped per client and per week, and each group is concatenated into a single text.

In [None]:
# Load the records dataset from the Hugging Face Hub
dataset = load_dataset(path_hf_records)
df_records = dataset['train'].to_pandas()

# Floor each timestamp to the first day of its week.
# pd.to_datetime keeps this working when 'datetime' arrives as strings
# instead of a datetime64 column; the column is computed once and reused below.
df_records['week'] = (
    pd.to_datetime(df_records['datetime'])
    .dt.to_period('W')
    .dt.to_timestamp()
)

# One row per client per week; that week's notes are joined into a single string
df = (df_records
    .dropna()  # drops a record when ANY of its columns is missing
    .groupby(['client_id', 'week'])
    .agg({'note': lambda x: '\n'.join(x)}) # Concatenate 'note' values
    .reset_index()
    .rename(columns={'note': 'weeknotes'})
)

if verbose:
  print(f"Rows in original df: {df_records.shape[0]}, rows in processed df: {df.shape[0]}\n")
  # Seeded and capped at the frame size: reproducible output, no error on tiny datasets
  print(f"SAMPLES{sep_line}\n{df.sample(min(3, len(df)), random_state=seed)}\n")
  if len(df) > 0:
    print(f"\nContext column (weeknotes) example:{sep_line}\n{df['weeknotes'].iloc[0]}")


## LLM Response Generation: Careplans

In [None]:
# Schema for one care plan entry: a single problem, its care goal and the interventions for it.
# The Field descriptions are Dutch instructions; presumably they are included in the schema text
# that the output parser injects into the prompt (confirm against the parser's format instructions).
# NOTE(review): the "1 tot max 3" limit is only stated in the description; nothing here validates list length.
class CarePlanItem(BaseModel):
    problem: str = Field(..., description="Beschrijving van het zorgprobleem. Zorg dat er slechts één probleem wordt beschreven")
    care_goal: str = Field(..., description="Beschrijving van het zorgdoel")
    interventions: List[str] = Field(..., description="Beschrijving van 1 tot max 3 interventies")

# Top-level schema: the complete care plan is a list of CarePlanItem entries.
# NOTE(review): as with the item schema, the "1 tot max 3" limit is descriptive only and not enforced here.
class CarePlan(BaseModel):
    careplan: List[CarePlanItem] = Field(..., description="Lijst van 1 tot max 3 zorgdoelen en interventies")

# Parser that turns the model's reply into a CarePlan instance; its format
# instructions are injected into the prompt template in a later cell
pyd_parser = PydanticOutputParser(pydantic_object=CarePlan)

In [None]:
# Dutch prompt text. {rapportages} receives the weekly notes at call time;
# {format_instructions} is filled in once, from the output parser.
template = """
Schrijf een zorgplan op basis van onderstaande rapportages.

Belangrijk:
- Gebruik uitsluitend informatie uit de rapportages. Voeg geen eigen interpretaties toe.
- Richt je op algemene observaties en patronen, zonder de details van de rapportages over te nemen.

---
RAPPORTAGES:
{rapportages}
---

{format_instructions}
"""

# Only "rapportages" remains a run-time input; the format instructions are pre-bound
prompt_template = PromptTemplate(
    input_variables=["rapportages"],
    partial_variables=dict(format_instructions=pyd_parser.get_format_instructions()),
    template=template,
)


In [None]:

# Chat model configured from the notebook parameters
llm = ChatOpenAI(model=model, temperature=temperature, api_key=open_ai_api_key)

# Pipeline: fill in the prompt -> call the model -> parse the reply with pyd_parser
chain = prompt_template | llm | pyd_parser


# Demo: send one week of notes through the chain and print the prompt and the parsed careplan.
# This performs a real LLM call, so it only runs when `verbose` is set and there is data.
if verbose and len(df) > 0:
    # Prefer row 5, but fall back to the last row when the frame is smaller than that
    sample_id = min(5, len(df) - 1)
    sample_context = df['weeknotes'].iloc[sample_id]

    # Render through prompt_template (not the raw template string) so the printed
    # text is the prompt the chain builds, including the pre-bound format instructions
    sample_prompt = prompt_template.format(rapportages=sample_context)

    result = chain.invoke({"rapportages": sample_context})

    # Print the prompt, followed by the parsed CarePlan
    print(sample_prompt)

    print("RESPONSE")

    for i, item in enumerate(result.careplan, start=1):
        print(f"Probleem {i}:")
        print(item.problem)
        print(item.care_goal)
        for intervention in item.interventions:
            print(f"- {intervention}")

In [None]:
def generate_careplan(notes: str) -> CarePlan:
    """Run the careplan chain on one block of notes.

    Args:
        notes: The notes text that is passed to the chain as "rapportages".

    Returns:
        Whatever `chain.invoke` returns on success. If the call raises any
        exception, the error is printed and None is returned instead, so a
        single failing row does not abort a long batch run.
    """
    payload = {"rapportages": notes}
    try:
        return chain.invoke(payload)
    except Exception as err:
        print(f"Error generating Careplan: {err}")
    return None

# Resume from the local checkpoint if one exists, otherwise start with an empty response column.
# NOTE(review): unpickling can execute code embedded in the file; only load checkpoints
# that this notebook wrote itself.
if os.path.exists(fn_responses):
    df = pd.read_pickle(fn_responses)
else:
    df['careplan_response'] = None  # Ensure the column exists

# Create the checkpoint directory up front, so the first save cannot fail
# after LLM calls have already been made.
checkpoint_dir = os.path.dirname(fn_responses)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint_every = 100      # save after this many newly generated careplans
new_since_checkpoint = 0    # generations since the last save

# The callback accumulates the cost of every call made inside this block
with get_openai_callback() as cb:

    # Generate careplans; rows that already have a response are skipped
    with tqdm(total=len(df), desc="Generating Careplans") as pbar:  # Set the total for the progress bar
        for idx, row in df.iterrows():
            # Missing responses include earlier failures (stored as None), so a re-run retries them
            if pd.isna(df.at[idx, 'careplan_response']):
                df.at[idx, 'careplan_response'] = generate_careplan(row['weeknotes'])
                new_since_checkpoint += 1

                # Checkpoint on the amount of new work rather than on the row label,
                # so a resumed run does not rewrite the file while it is only skipping rows
                if new_since_checkpoint >= checkpoint_every:
                    df.to_pickle(fn_responses)
                    new_since_checkpoint = 0
                    print(f"Checkpoint saved at index {idx}, total cost so far: ${cb.total_cost:.4f}")

            # Update the progress bar
            pbar.update(1)

    # Save the final result
    df.to_pickle(fn_responses)
    print("Processing complete and final dataframe saved.")

    # Make failed generations visible instead of leaving them silently in the frame
    n_missing = int(df['careplan_response'].isna().sum())
    if n_missing:
        print(f"{n_missing} rows have no careplan (generation failed); re-run this cell to retry them.")

    print(f"Total cost: ${cb.total_cost:.4f}")


## Dataset Creation, Splitting and Saving

In [None]:
# Instruction text stored with every example in the dataset.
# The embedded JSON schema must use plain ASCII double quotes: with typographic
# quotes it is not valid JSON and does not match the format of the responses.
instruction = '''Schrijf een zorgplan op basis van onderstaande rapportages. Gebruik alleen informatie uit de rapportages, zonder eigen interpretaties toe te voegen.

Formatteer de output als een JSON-instantie die voldoet aan het onderstaande JSON-schema.
```
{"$defs":{"CarePlanItem":{"properties":{"problem":{"title":"Problem","type":"string"},"care_goal":{"title":"Care Goal","type":"string"},"interventions":{"title":"Interventions","type":"array","items":{"type":"string"}}},"required":["problem","care_goal","interventions"],"title":"CarePlanItem","type":"object"}},"properties":{"careplan":{"title":"Careplan","type":"array","items":{"$ref":"#/$defs/CarePlanItem"}}},"required":["careplan"]}
```
'''

In [None]:
def _careplan_to_json(careplan) -> str:
    """Serialize a parsed careplan object to JSON text.

    `str()` on a pydantic model gives its repr-style form
    (`careplan=[CarePlanItem(...)]`), which is not the JSON that the
    instruction asks for, so the model's own JSON serializer is used.
    """
    if hasattr(careplan, "model_dump_json"):  # pydantic v2
        return careplan.model_dump_json()
    return careplan.json(ensure_ascii=False)  # pydantic v1 fallback


# Build the final frame: drop rows without a careplan (failed generations would
# otherwise become the literal string "None"), rename, serialize the response
# to JSON, attach the instruction and put the columns in their final order.
df_careplan = (
    df.dropna(subset=['careplan_response'])
    .rename(columns={'weeknotes': 'context', 'careplan_response': 'response'})
    .assign(
        response=lambda d: d['response'].map(_careplan_to_json),
        instruction=instruction
    )
    [['client_id', 'week', 'context', 'instruction', 'response']]
)

In [None]:
# Turn the careplan frame into a Hugging Face dataset, split it 80/10/10 and upload it

# The pandas index is not kept as a column
dataset = Dataset.from_pandas(df=df_careplan, preserve_index=False)

# Step 1: hold out 20% of the rows
train_testvalid_split = dataset.train_test_split(test_size=0.2, seed=seed)

# Step 2: cut the held-out part in half -> 10% validation, 10% test
test_valid_split = train_testvalid_split['test'].train_test_split(test_size=0.5, seed=seed)

dataset_dict = DatasetDict(
    {
        'train': train_testvalid_split['train'],
        'validation': test_valid_split['train'],
        'test': test_valid_split['test'],
    }
)

# Upload all three splits to the Hub
dataset_dict.push_to_hub(
    path_hf_careplan,
    commit_message=commit_message,
    private=True,
)



In [None]:
# Display one example from the test split as a sanity check
dataset_dict["test"][6]