<a href="https://colab.research.google.com/github/ekrombouts/GenCareAI/blob/zuster_fietje/notebooks/300_zuster_fietje/100_sampc_summary_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summarisation: SAMPC

**Author:** Eva Rombouts  
**Date:** 2024-09-16  

### Description
This notebook summarizes nursing home client notes into the SAMPC format using an OpenAI GPT model via LangChain. It loads the client records, groups the notes per client per week, and generates a structured summary for each group. The resulting dataset is split into training, validation, and test sets and uploaded to the Hugging Face Hub for use in machine learning models.

In [None]:
# Install necessary libraries
!pip install -q datasets langchain langchain_community langchain_openai

# Import necessary modules
from google.colab import drive, userdata

import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset, DatasetDict
from typing import List

from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.callbacks import get_openai_callback
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI

from sklearn.model_selection import train_test_split

In [None]:
# Read the OpenAI API key through google.colab's `userdata` helper so the key
# is not hardcoded in the notebook.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

In [None]:
# Set parameters
# NOTE: the name `model` is rebound to the ChatOpenAI instance in the
# model-initialisation cell further down; here it is the model-name string.
model = "gpt-4o-mini-2024-07-18"  # model identifier passed to ChatOpenAI
temperature = 0.3  # temperature passed to ChatOpenAI
seed = 6  # seed used for the train/validation/test splits
path_hf_clientrecords = "ekrombouts/Galaxy_records"  # Hugging Face dataset the client records are loaded from
path_hf_sampc = "ekrombouts/Galaxy_SAMPC"  # Hugging Face repo the summarised dataset is pushed to
commit_message = "Grouped notes by week"  # commit message used for the Hub upload

In [None]:
# Load the client records and group the notes per client per week

# Fetch the raw records from the Hugging Face Hub
dataset = load_dataset(path_hf_clientrecords)
df_records = dataset['train'].to_pandas()

# Floor each timestamp to the start of its weekly period
weekly_period = df_records['datetime'].dt.to_period('W')
df_records['week'] = weekly_period.dt.to_timestamp()

# One row per (ct_id, week): all notes of that week joined with newlines.
# NOTE(review): notes are joined in the row order of df_records; this assumes
# the source dataset is already in chronological order — TODO confirm.
df = (
    df_records
    .groupby(['ct_id', 'week'])
    .agg(notes=('note', lambda week_notes: '\n'.join(week_notes)))
    .reset_index()
)

In [None]:
# Define the SAMPC model using Pydantic to structure the summarised data.
#
# The class is handed to PydanticOutputParser, so the field names, types and
# `description` strings below end up in the format instructions of the prompt
# and define what the parsed model output must look like.
#
# NOTE(review): documentation is kept in `#` comments on purpose. Pydantic
# includes a class docstring in the generated JSON schema, which would change
# the format instructions sent to the model — confirm before adding one.
class SAMPC(BaseModel):
    # Physical complaints (list of strings)
    somatiek: List[str] = Field(description="lichamelijke klachten")
    # Help the client needs with washing and dressing
    adl: str = Field(description="beschrijf welke hulp de cliënt nodig heeft bij wassen en kleden")
    # Mobility (e.g. wheelchair-dependent, uses a walker, fall risk)
    mobiliteit: str = Field(description="beschrijf de mobiliteit (bv rolstoelafhankelijk, gebruik rollator, valgevaar)")
    # Continence
    continentie: str = Field(description="continentie")
    # Particulars about family and daytime activities
    maatschappelijk: str = Field(description="beschrijf bijzonderheden familie en dagbesteding")
    # Cognition and problem behaviour (list of strings)
    psychisch: List[str] = Field(description="beschrijf cognitie en probleemgedrag")

In [None]:
# Initialize the model and set up the Prompt Template

# Initialize OpenAI Chat model.
# `model` starts out as the model-name string from the parameters cell and is
# rebound to the ChatOpenAI instance below. Resolve the name first so that
# re-running this cell (when `model` already is a ChatOpenAI instance) does not
# fail by passing that instance as the model name.
model_name = model if isinstance(model, str) else model.model_name
model = ChatOpenAI(api_key=OPENAI_API_KEY, temperature=temperature, model=model_name)

# Set up a parser to handle the output and inject instructions into the prompt template
pyd_parser = PydanticOutputParser(pydantic_object=SAMPC)


# Dutch prompt: asks for a concise client profile per category, using only
# information from the notes. `rapportages` is filled per call; the format
# instructions are pre-filled once from the parser.
prompt_template = PromptTemplate(
    template="""
Vat de onderstaande rapportages kort en bondig samen om een profiel van de cliënt te schetsen met de volgende categorieën:

Categorieën:
- Somatiek
- Wassen en aankleden
- Mobiliteit
- Continentie
- Maatschappelijk
- Psychisch

Belangrijk:
- Gebruik uitsluitend informatie uit de rapportages. Voeg geen eigen interpretaties toe.
- Als er geen informatie beschikbaar is voor een categorie, noteer dan 'geen informatie beschikbaar' voor die categorie.
- Richt je op algemene observaties en patronen, zonder de details van de rapportages over te nemen.

---
RAPPORTAGES:
{rapportages}
---

{format_instructions}
""",
    input_variables=["rapportages"],
    partial_variables={"format_instructions": pyd_parser.get_format_instructions()},
)

In [None]:
# Create the Chain and Define the Function to Generate Summaries

# Compose the pipeline: fill the prompt, call the chat model, parse the reply
chain = prompt_template | model | pyd_parser


def generate_sampc_summary(notes: str) -> SAMPC:
    """Summarise one block of client notes into a SAMPC object.

    Args:
        notes: The concatenated notes; inserted into the prompt's
            ``rapportages`` placeholder.

    Returns:
        The result of invoking ``chain`` on the notes. The chain ends in the
        Pydantic output parser configured for ``SAMPC``.
    """
    return chain.invoke({"rapportages": notes})

In [None]:
# Generate Summaries
sampc_results = []

# Run all requests inside the OpenAI callback context; the callback object is
# printed once the loop has finished.
with get_openai_callback() as cb:
    # One summary per entry of df['notes'], appended in the same order as df
    for client_notes in tqdm(df['notes'], desc="Generating SAMPC summaries"):
        sampc_results.append(generate_sampc_summary(client_notes))
    print(cb)

In [None]:
# Convert results to a DataFrame

# Pydantic v2 deprecates `.dict()` in favour of `.model_dump()`; use the new
# method when it exists and fall back to `.dict()` on Pydantic v1.
sampc_records = [
    s.model_dump() if hasattr(s, 'model_dump') else s.dict()
    for s in sampc_results
]
df_sampc = pd.DataFrame(sampc_records)

# Attach the identifying columns and the source notes. These assignments align
# on the index: sampc_results was built by iterating df['notes'] in order, and
# df carries the fresh index produced by reset_index().
df_sampc['ct_id'] = df['ct_id']
df_sampc['week'] = df['week']
df_sampc['notes'] = df['notes']

# Reorder columns: identifiers and input first, then the summary fields
df_sampc = df_sampc[['ct_id', 'week', 'notes', 'somatiek', 'adl', 'mobiliteit', 'continentie', 'maatschappelijk', 'psychisch']]

In [None]:
# Split the dataset and push to Hugging Face hub

# Wrap the pandas DataFrame in a Hugging Face Dataset
dataset = Dataset.from_pandas(df_sampc)

# First hold out 20% of the rows, then halve that hold-out, giving
# training (80%), validation (10%) and test (10%) sets
train_testvalid_split = dataset.train_test_split(test_size=0.2, seed=seed)
test_valid_split = train_testvalid_split['test'].train_test_split(test_size=0.5, seed=seed)

splits = {
    'train': train_testvalid_split['train'],
    'validation': test_valid_split['train'],
    'test': test_valid_split['test'],
}
dataset_dict = DatasetDict(splits)

# Upload all three splits to the Hugging Face Hub as a private dataset
dataset_dict.push_to_hub(
    path_hf_sampc,
    commit_message=commit_message,
    private=True,
)

