<a href="https://colab.research.google.com/github/ekrombouts/gcai_zuster_fietje/blob/main/notebooks/300_GenCareAISAMPCDatasetCreation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Creating a SAMPC Dataset from GenCareAI Client records

**Author:** Eva Rombouts  
**Date:** 2024-09-16  
**Updated:** 2024-10-16

### Description
This notebook summarizes nursing home client notes into the SAMPC format using an OpenAI GPT model via LangChain. It preprocesses the notes, generates the summaries, splits the resulting dataset into training, validation, and test sets, and uploads it to the Hugging Face Hub for use in machine learning models.

## Environment Setup and Library Imports

In [None]:
# Install and import the project helper package. The GenCareAISetup object is
# used throughout this notebook for the environment name, file paths and the OpenAI key.
!pip install GenCareAIUtils
from GenCareAIUtils import GenCareAISetup

setup = GenCareAISetup()

# The extra third-party dependencies are only installed when running on Colab
if setup.environment == 'Colab':
    !pip install -q datasets langchain langchain_community langchain_openai

# When True, cells print samples and intermediate results for inspection
verbose = True

In [None]:
import os
import random
import pandas as pd
from tqdm import tqdm  # Progress bar
from datasets import load_dataset, Dataset, DatasetDict  # For loading and managing datasets
from typing import List

# Langchain modules
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.callbacks import get_openai_callback

# Pydantic library for data validation
from pydantic import BaseModel, Field

# OpenAI API integration using langchain
from langchain_openai import ChatOpenAI

# Scikit-learn library for splitting datasets into training and testing sets
from sklearn.model_selection import train_test_split


In [None]:
# Set parameters
# Random seed; intended for the reproducible train/validation/test split
seed = 6

# Data paths
# Name of the (fictional) nursing home; part of every repository and file name below
nursing_care_home_name = "Olympia"
# For reading data: Hugging Face repositories holding the source data
# (path_hf_clients is defined for completeness; it is not read in the cells shown here)
path_hf_records = f"ekrombouts/{nursing_care_home_name}_records"
path_hf_clients = f"ekrombouts/{nursing_care_home_name}_clients"

# For writing data: target Hugging Face repository and the commit message used on upload
path_hf_sampc = f"ekrombouts/{nursing_care_home_name}_SAMPC_dataset"
commit_message = "SAMPC dataset"

# File path for saving the generated SAMPC summaries (pickle file, also used as checkpoint)
fn_responses = setup.get_file_path(f"data/care_pal/{nursing_care_home_name}_SAMPC_dataset.pkl")

# Settings for SAMPC summary generation
model = "gpt-4o-mini-2024-07-18"
temperature = 0.3

# Separator line used in the verbose printouts
sep_line = 50 * '-'


## Loading and Preprocessing Data
Client records and notes from fictional clients of a nursing home are loaded, cleaned, and processed. The notes are then grouped per client and per week.

In [None]:
# Load the client records from the Hugging Face Hub.
# The 'week' column holds the start of the calendar week of each note; pandas 'W'
# periods run Monday-Sunday, so this is Monday 00:00. pd.to_datetime makes the step
# work regardless of whether 'datetime' arrives as a timestamp or as a string.
dataset = load_dataset(path_hf_records)
df_records = (
    dataset['train']
    .to_pandas()
    .assign(week=lambda d: pd.to_datetime(d['datetime']).dt.to_period('W').dt.to_timestamp())
)

# Group records by 'client_id' and 'week', concatenating the notes of one client-week
# into a single newline-separated string. Rows with any missing value are dropped first.
df = (
    df_records
    .dropna()
    .groupby(['client_id', 'week'])
    .agg({'note': lambda notes: '\n'.join(notes)})  # Concatenate 'note' values
    .reset_index()
    .rename(columns={'note': 'weeknotes'})
)

if verbose:
    print(f"Rows in original df: {df_records.shape[0]}, rows in processed df: {df.shape[0]}\n")
    # Seeded so that re-running the notebook shows the same sample rows
    print(f"SAMPLES{sep_line}\n{df.sample(3, random_state=seed)}\n")
    print(f"\nContext column (weeknotes) example:{sep_line}\n{df['weeknotes'].iloc[0]}")


## LLM Response Generation: SAMPC Summaries

In [None]:
# Define the SAMPC model using Pydantic to structure the summarised data.
# Each field corresponds to one category of the client profile; the (Dutch) field
# descriptions are passed to the LLM as part of the format instructions.
# NOTE(review): documented with comments rather than a class docstring on purpose -
# pydantic is expected to copy a class docstring into the JSON schema, which would
# alter the format instructions sent to the model. Confirm before adding one.
class SAMPC(BaseModel):
    # Physical complaints (list of findings)
    somatiek: List[str] = Field(description="lichamelijke klachten")
    # Help needed with washing and dressing
    adl: str = Field(description="beschrijf welke hulp de cliënt nodig heeft bij wassen en kleden")
    # Mobility, e.g. wheelchair dependent, uses a walker, fall risk
    mobiliteit: str = Field(description="beschrijf de mobiliteit (bv rolstoelafhankelijk, gebruik rollator, valgevaar)")
    # Continence
    continentie: str = Field(description="continentie")
    # Social context: family and daytime activities
    maatschappelijk: str = Field(description="beschrijf bijzonderheden familie en dagbesteding")
    # Cognition and problem behaviour (list of findings)
    psychisch: List[str] = Field(description="beschrijf cognitie en probleemgedrag")


# Set up a parser to handle the output and inject instructions into the prompt template
pyd_parser = PydanticOutputParser(pydantic_object=SAMPC)


In [None]:
# Dutch prompt. It asks the model to summarise the reports into a client profile with
# six categories, to use only information from the reports, to write
# 'geen informatie beschikbaar' for a category without information, and to focus on
# general observations rather than copying details.
# Placeholders: {rapportages} (the notes) and {format_instructions} (output schema).
template="""
Vat de onderstaande rapportages kort en bondig samen om een profiel van de cliënt te schetsen met de volgende categorieën:

Categorieën:
- Somatiek
- Wassen en aankleden
- Mobiliteit
- Continentie
- Maatschappelijk
- Psychisch

Belangrijk:
- Gebruik uitsluitend informatie uit de rapportages. Voeg geen eigen interpretaties toe.
- Als er geen informatie beschikbaar is voor een categorie, noteer dan 'geen informatie beschikbaar' voor die categorie.
- Richt je op algemene observaties en patronen, zonder de details van de rapportages over te nemen.

---
RAPPORTAGES:
{rapportages}
---

{format_instructions}
"""

# {format_instructions} is pre-filled from the parser, so callers of the prompt
# template only have to supply `rapportages`.
prompt_template = PromptTemplate(
    template=template,
    input_variables=["rapportages"],
    partial_variables={"format_instructions": pyd_parser.get_format_instructions()},
)


In [None]:
# Initialize OpenAI Chat model
llm = ChatOpenAI(
    api_key=setup.get_openai_key(),
    model=model,
    temperature=temperature
)

# Pipeline: fill in the prompt -> call the model -> parse the reply into a SAMPC object
chain = prompt_template | llm | pyd_parser


# Smoke test on a single client-week before starting the full (paid) generation run
if verbose and len(df) > 0:
    # Prefer row 50, but stay within bounds for small datasets
    sample_id = min(50, len(df) - 1)
    sample_context = df['weeknotes'].iloc[sample_id]

    # Render the prompt through the same template the chain uses (the format
    # instructions are filled in by the template's partial variables), and print it
    # before the API call so it is visible even if the call fails.
    sample_prompt = prompt_template.format(rapportages=sample_context)
    print(sample_prompt)

    result = chain.invoke({"rapportages": sample_context})

    print("RESPONSE")
    print("Somatiek:\t", result.somatiek)
    print("ADL:\t\t", result.adl)
    print("Mobiliteit:\t", result.mobiliteit)
    print("Continentie:\t", result.continentie)
    print("Maatschappelijk:", result.maatschappelijk)
    print("Psychisch:\t", result.psychisch)


In [None]:
# # Function to generate the SAMPC summary
# def generate_sampc_summary(notes: str) -> SAMPC:
#     result = chain.invoke({"rapportages": notes})
#     return result

# # Generate Summaries
# sampc_results = []

# with get_openai_callback() as cb:
#   # Loop through the 'notes' column and generate SAMPC summaries, store them in the list
#   for notes in tqdm(df['notes'], desc="Generating SAMPC summaries"):
#       sampc_summary = generate_sampc_summary(notes)
#       sampc_results.append(sampc_summary)
#   print(cb)

In [None]:
# Inspect the columns of the grouped frame before the long-running generation step
df.info()

In [None]:
# Function to generate the SAMPC summary with error handling
def generate_sampc_summary(notes: str) -> "SAMPC | None":
    """Summarise a block of client notes into a SAMPC object.

    Args:
        notes: The concatenated notes to summarise; passed to the chain as
            the `rapportages` prompt variable.

    Returns:
        The parsed SAMPC summary, or None when invoking the chain raised
        (for example an API error or an unparsable model reply). Errors are
        reported on stdout instead of being propagated, so that one failing
        row does not abort a long batch run; callers must handle None.
    """
    try:
        return chain.invoke({"rapportages": notes})
    except Exception as e:
        # Deliberately broad: any failure for a single row is logged and skipped
        print(f"Error generating SAMPC summary: {e}")
        return None

# Number of newly generated summaries between two checkpoint saves
checkpoint_every = 10

# Load the previously saved dataframe if it exists, otherwise start fresh.
# NOTE: unpickling executes code from the file; only load checkpoints written by this notebook.
if os.path.exists(fn_responses):
    df = pd.read_pickle(fn_responses)
if 'sampc_response' not in df.columns:
    df['sampc_response'] = None  # Ensure the column exists

n_generated = 0  # rows sent to the model during this run

# Create a callback instance to track cost
with get_openai_callback() as cb:
    try:
        # Generate summaries, process only new entries
        with tqdm(total=len(df), desc="Generating SAMPC summaries") as pbar:
            for idx, row in df.iterrows():
                # Rows that already hold a summary are skipped, which makes the cell
                # resumable; rows that failed earlier (None) are retried.
                if pd.isna(df.at[idx, 'sampc_response']):
                    df.at[idx, 'sampc_response'] = generate_sampc_summary(row['weeknotes'])
                    n_generated += 1

                    # Checkpoint on new work only, so resuming does not rewrite the
                    # file while skipping over rows that are already done.
                    if n_generated % checkpoint_every == 0:
                        df.to_pickle(fn_responses)
                        # tqdm.write keeps the progress bar intact
                        tqdm.write(f"Checkpoint saved at index {idx}, total cost so far: ${cb.total_cost:.4f}")

                # Update the progress bar
                pbar.update(1)
    finally:
        # Save the final result; also runs on interruption or error so that at most
        # the row in flight is lost.
        df.to_pickle(fn_responses)

    n_missing = int(df['sampc_response'].isna().sum())
    print("Processing complete and final dataframe saved.")
    if n_missing:
        print(f"{n_missing} rows have no summary (generation failed); re-run this cell to retry them.")
    print(f"Total cost: ${cb.total_cost:.4f}")


Generating SAMPC summaries:  18%|█▊        | 311/1758 [14:43<2:07:42,  5.30s/it]

Checkpoint saved at index 310, total cost so far: $0.0673


## Dataset Creation, Splitting and Saving

In [None]:
# Convert the generated summaries (stored as SAMPC objects in df['sampc_response'])
# to a flat DataFrame with one column per SAMPC field.
if 'sampc_response' not in df.columns:
    raise ValueError("No 'sampc_response' column found; run the summary generation cell first.")

# Keep only rows that actually have a summary (failed generations are stored as None).
# The index is reset so the columns assigned below line up row by row.
df_done = df[df['sampc_response'].notna()].reset_index(drop=True)
if df_done.empty:
    raise ValueError("No SAMPC summaries available; run the summary generation cell first.")

# model_dump() is the pydantic v2 spelling of the deprecated .dict()
sampc_records = [
    s.model_dump() if hasattr(s, 'model_dump') else s.dict()
    for s in df_done['sampc_response']
]
df_sampc = pd.DataFrame(sampc_records)
df_sampc['client_id'] = df_done['client_id']
df_sampc['week'] = df_done['week']
# The grouped notes live in 'weeknotes'; the dataset exposes them as 'notes'
df_sampc['notes'] = df_done['weeknotes']

# Reorder columns
df_sampc = df_sampc[['client_id', 'week', 'notes', 'somatiek', 'adl', 'mobiliteit', 'continentie', 'maatschappelijk', 'psychisch']]

In [None]:
# # Split the dataset and push to Hugging Face hub

# # Convert df to Hugging Face dataset
# dataset = Dataset.from_pandas(
#     df=df_sampc,
#     preserve_index=False
# )

# # Split the dataset into training(80%), validation(10%), and test(10%) sets
# train_testvalid_split = dataset.train_test_split(
#     test_size=0.2,
#     seed=seed
# )
# test_valid_split = train_testvalid_split['test'].train_test_split(
#     test_size=0.5,
#     seed=seed
# )

# dataset_dict = DatasetDict({
#     'train': train_testvalid_split['train'],
#     'validation': test_valid_split['train'],
#     'test': test_valid_split['test'],
# })

# # # Push the dataset to Hugging Face Hub
# # dataset_dict.push_to_hub(path_hf_sampc,
# #                          commit_message=commit_message,
# #                          private=True)

