<a href="https://colab.research.google.com/github/ekrombouts/gcai_zuster_fietje/blob/main/notebooks/300_GenCareAIQADatasetCreation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Creating a QA Dataset from GenCareAI Client Records

**Author:** Eva Rombouts  
**Date:** 2024-10-16  

### Description
This notebook creates a dataset from client records and predefined instruction prompts. Healthcare context notes are matched with relevant instructions using embeddings and cosine similarity. A language model generates responses to these instructions, and the final dataset is split into training, validation, and test sets. The dataset is then saved locally and can be uploaded to Hugging Face for further use.

## Environment Setup and Library Imports

In [None]:
# When in Colab
from google.colab import drive, userdata
import os

# Mount Google Drive at /content/drive so the project folder below is reachable
drive.mount('/content/drive')
# Root folder that the data file paths in this notebook are joined onto
base_dir = "/content/drive/My Drive/Colab Notebooks/GenCareAI"
# Fetch the OpenAI API key via Colab's userdata instead of hardcoding it
open_ai_api_key = userdata.get("GCI_OPENAI_API_KEY")

!pip install -q datasets sentence-transformers langchain langchain_openai langchain_community

In [None]:
# # When running locally
# import os
# from pathlib import Path

# base_dir = Path(os.getcwd()).resolve().parents[0]
# open_ai_api_key = os.getenv("GCI_OPENAI_API_KEY")

In [None]:
# Import necessary libraries
import os
import random
import pandas as pd

# For data splitting and similarity calculations
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# Importing libraries for working with LLM prompts and OpenAI
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_community.callbacks import get_openai_callback

# Torch for deep learning and sentence transformers for embeddings
import torch
from sentence_transformers import SentenceTransformer

# Progress bar utilities
from tqdm.autonotebook import tqdm, trange

# Hugging Face dataset utilities
from datasets import load_dataset, Dataset, DatasetDict

verbose = True

In [None]:
# Set parameters
seed = 6

# Data paths
nursing_care_home_name = "Gardenia"
# For reading data
path_hf_records = f"ekrombouts/{nursing_care_home_name}_records"
path_hf_clients = f"ekrombouts/{nursing_care_home_name}_clients"

# For writing data
path_hf_instruct = f"ekrombouts/{nursing_care_home_name}_instruct_dataset"
commit_message = "Instruct dataset"

# Function to generate file paths for embeddings
def get_embedding_path(gender, context_or_instruction):
    """Return the cache file location for one set of embeddings.

    The file is named ``<gender>_<context_or_instruction>_embeddings.pt`` and
    is placed in the ``data/care_pal`` folder below ``base_dir``.
    """
    relative_path = 'data/care_pal/{}_{}_embeddings.pt'.format(gender, context_or_instruction)
    return os.path.join(base_dir, relative_path)

# File paths for saving/loading the embeddings
fn_male_context_embeddings = get_embedding_path('male', 'context')
fn_female_context_embeddings = get_embedding_path('female', 'context')
fn_male_instruction_embeddings = get_embedding_path('male', 'instruction')
fn_female_instruction_embeddings = get_embedding_path('female', 'instruction')

# File path for saving the generated responses
fn_responses = os.path.join(base_dir, 'data/care_pal/context_instruction_pairs_with_responses.pkl')

# Additional parameters
# Number of contexts randomly sampled for each general instruction
num_general_prompts = 250
# Number of best-matching instructions kept per context
k_instructions = 2
# Number of best-matching contexts kept per instruction
k_contexts = 50
# Separator line used in the verbose printouts
sep_line = 50 * '-'

# Set seed for reproducibility
random.seed(seed)

## Loading and Preprocessing Data
Client records and notes from fictional clients of a nursing home are loaded, cleaned, and processed. Client genders are identified, and the notes are grouped by week.

In [None]:
# Load datasets from Hugging Face and preprocess
def load_and_preprocess_data():
    """Load the records and clients datasets and build one row per client-week.

    Returns:
        tuple: ``(weekly, df_records)``. ``weekly`` has the columns
        ``client_id``, ``week``, ``weeknotes``, ``name`` and ``gender``;
        ``df_records`` is the unprocessed 'train' split of the records dataset.
    """
    df_records = load_dataset(path_hf_records)['train'].to_pandas()
    df_clients = load_dataset(path_hf_clients)['train'].to_pandas()

    def gender_from_name(row):
        # The salutation in the client's name is the only gender signal used.
        name = row["name"]
        if "Mevrouw" in name:
            return "female"
        if "Meneer" in name:
            return "male"
        return "unknown"

    # Records with any missing value are discarded before grouping.
    complete_records = df_records.dropna()
    week_start = pd.to_datetime(complete_records['datetime']).dt.to_period('W').dt.to_timestamp()

    # All notes of one client within one week are joined into a single text.
    weekly_notes = (
        complete_records
        .assign(week=week_start)
        .groupby(['client_id', 'week'])
        .agg({'note': lambda notes: '\n'.join(notes)})
        .reset_index()
        .rename(columns={'note': 'weeknotes'})
    )

    # Attach the client name, then derive the gender from it.
    with_names = weekly_notes.merge(df_clients[['client_id', 'name']], on='client_id', how='left')
    genders = with_names.apply(gender_from_name, axis=1)
    return with_names.assign(gender=genders), df_records

# df holds one row per client-week; df_records is the raw records table
df, df_records = load_and_preprocess_data()

if verbose:
  # Compare the number of raw notes with the number of client-week rows
  print(f"Rows in original df: {df_records.shape[0]}, rows in processed df: {df.shape[0]}\n")
  # Three random rows (not seeded here, so they differ between runs)
  print(f"SAMPLES{sep_line}\n{df.sample(3)}\n")
  print(f"\nContext column (weeknotes) example:{sep_line}\n{df['weeknotes'].iloc[0]}")
  # normalize=True yields fractions between 0 and 1, not percentages
  print(f"\nPercentage gender:{sep_line}\n{df['gender'].value_counts(normalize=True)}")


## Instruction design
A list of question prompts is created for male, female, and general contexts. These prompts are relevant to nursing home care, focusing on conditions, care requirements, and observations.

In [None]:
# Define instruction prompts for male, female, and general contexts
instructions_male = [
    "Beschrijf lichamelijke klachten",
    "Hoe voelt de patiënt zich?",
    "Welke ziektes heeft cliënt?",
    "Beschrijf de klachten van meneer",
    "Heeft deze cliënt pijn?",
    "Welke ongemakken ervaart dhr?",
    "Welke behandeling is ingezet?",
    "Beschrijf of er wonden of huidproblemen zijn",
    "Beschrijf de benodigde ADL hulp",
    "Beschrijf bijzonderheden over eten en drinken",
    "Welke hulp heeft dhr nodig bij wassen en aankleden?",
    "Geef aan welke hulp wordt geboden bij eten en drinken",
    "Wordt meneer geholpen bij douchen?",
    "Hoe wordt de ADL gedaan?",
    "Beschrijf de mobiliteit van meneer",
    "Welk loophulpmiddel gebruikt cliënt?",
    "Beschrijf de mate van valgevaar",
    "Welke hulp wordt geboden bij de mobiliteit?",
    "Beschrijf de daginvulling van meneer",
    "Doet ct mee aan activiteiten?",
    "Hoe verlopen de nachten?",
    "Heeft meneer lekker geslapen?",
    "Geef aan of er stemmingsklachten zijn",
    "Beschrijf gedragsproblemen",
    "Hoe is de cognitie van meneer?",
]

instructions_female = [
    "Beschrijf lichamelijke klachten",
    "Hoe voelt de patiënt zich?",
    "Welke ziektes heeft cliënte?",
    "Beschrijf de klachten van mevrouw",
    "Heeft deze cliënte pijn?",
    "Welke ongemakken ervaart mw?",
    "Welke behandeling is ingezet?",
    "Beschrijf of er wonden of huidproblemen zijn",
    "Beschrijf de benodigde ADL hulp",
    "Beschrijf bijzonderheden over eten en drinken",
    "Welke hulp heeft mw nodig bij wassen en aankleden?",
    "Geef aan welke hulp wordt geboden bij eten en drinken",
    "Wordt mevrouw geholpen bij douchen?",
    "Hoe wordt de ADL gedaan?",
    "Beschrijf de mobiliteit van mevrouw",
    "Welk loophulpmiddel gebruikt cliënte?",
    "Beschrijf de mate van valgevaar",
    "Welke hulp wordt geboden bij de mobiliteit?",
    "Beschrijf de daginvulling van mevrouw",
    "Doet cte mee aan activiteiten?",
    "Hoe verlopen de nachten?",
    "Heeft mevrouw lekker geslapen?",
    "Geef aan of er stemmingsklachten zijn",
    "Beschrijf gedragsproblemen",
    "Hoe is de cognitie van mevrouw?",
]

instructions_general = [
    "Geef twee belangrijke punten waarop moet worden geobserveerd en gerapporteerd",
    "Noem de aandachtspunten voor het zorgpersoneel",
    "Welke acties moet het zorgteam nemen op basis van deze rapportages?",
    "Vat de rapportages kort en bondig samen",
]

# Full instruction list: male, then female, then general prompts
instructions = [*instructions_male, *instructions_female, *instructions_general]

if verbose:
    instruction_groups = (
        ("male", instructions_male),
        ("female", instructions_female),
        ("general", instructions_general),
    )
    for group_name, group_items in instruction_groups:
        print(f"Number of {group_name} instructions: {len(group_items)}")
    print(f"Total number of instructions: {len(instructions)}\n")

In [None]:
# Weekly notes per gender; rows whose gender is 'unknown' end up in neither list
is_male = df['gender'] == 'male'
is_female = df['gender'] == 'female'
male_contexts = df.loc[is_male, 'weeknotes'].tolist()
female_contexts = df.loc[is_female, 'weeknotes'].tolist()
contexts = male_contexts + female_contexts

if verbose:
    context_counts = (
        ("Number of male contexts", male_contexts),
        ("Number of female contexts", female_contexts),
        ("Total number of contexts", contexts),
    )
    for label, items in context_counts:
        print(f"{label}: {len(items)}")


## Embedding generation
Contexts (notes) and instructions are converted into embeddings using a sentence transformer model. These embeddings are used to represent the semantic meaning of the text and match the instructions to contexts.

In [None]:
# Load the embeddings model
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Function to load or generate embeddings
def load_or_generate_embeddings(file_path, data, model):
    """Return embeddings for ``data``, using ``file_path`` as an on-disk cache.

    A cached file is reused only when it holds exactly one row per element of
    ``data``; otherwise the embeddings are regenerated and the file overwritten.

    Args:
        file_path: Location of the cached tensor (str or path-like).
        data: List of texts to embed.
        model: Object exposing ``encode(sentences=..., convert_to_tensor=...,
            show_progress_bar=...)``, e.g. a ``SentenceTransformer``.

    Returns:
        The embeddings as a CPU tensor, one row per element of ``data``.
    """
    if os.path.exists(file_path):
        print(f"Loading embeddings from {file_path}")
        # map_location lets a cache written on a GPU runtime load on a CPU-only one.
        embeddings = torch.load(file_path, weights_only=True, map_location="cpu")
        if len(embeddings) == len(data):
            return embeddings
        # A cache built from a different list of texts would misalign rows and texts.
        print(f"Cached embeddings have {len(embeddings)} rows but data has {len(data)} items; regenerating")

    print(f"Generating embeddings for {file_path}")
    embeddings = model.encode(
        sentences=data,
        convert_to_tensor=True,
        show_progress_bar=True
    )
    # Keep results on the CPU: sklearn's cosine_similarity converts its inputs
    # to NumPy arrays, which is not possible for CUDA tensors.
    if isinstance(embeddings, torch.Tensor):
        embeddings = embeddings.cpu()

    # torch.save does not create missing folders.
    cache_dir = os.path.dirname(file_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    torch.save(embeddings, file_path)
    return embeddings

# Embeddings are cached per gender, separately for contexts and instructions
male_context_embeddings = load_or_generate_embeddings(
    file_path=fn_male_context_embeddings, data=male_contexts, model=model
)
female_context_embeddings = load_or_generate_embeddings(
    file_path=fn_female_context_embeddings, data=female_contexts, model=model
)
male_instruction_embeddings = load_or_generate_embeddings(
    file_path=fn_male_instruction_embeddings, data=instructions_male, model=model
)
female_instruction_embeddings = load_or_generate_embeddings(
    file_path=fn_female_instruction_embeddings, data=instructions_female, model=model
)

if verbose:
    first_embedding = male_context_embeddings[0]
    print(f"\nLength of male_context_embeddings: {len(male_context_embeddings)}")
    print(f"\nShape of the first embedding: {first_embedding.shape}")
    # Only the first 20 values are shown
    print(f"\nFirst embedding:\n{first_embedding[:20]}...")


## Matching Instructions to Contexts
Using cosine similarity, instructions are paired with the corresponding client notes. This helps in determining which instructions fit which contexts best.

In [None]:
# Function to calculate top-k similarities
def get_top_k_indices(cosine_sim_matrix, k, dim):
    """Return the ``k`` largest similarities along ``dim`` with their indices.

    Args:
        cosine_sim_matrix: Array-like or tensor of similarity scores.
        k: Number of entries requested; capped at the size of ``dim`` so a
            small dataset does not make ``torch.topk`` raise.
        dim: Dimension along which to select.

    Returns:
        The ``torch.topk`` result, exposing ``.values`` and ``.indices``.
    """
    # as_tensor avoids the copy (and warning) torch.tensor gives for tensor input.
    similarities = torch.as_tensor(cosine_sim_matrix)
    k = min(k, similarities.shape[dim])
    return torch.topk(similarities, k=k, dim=dim)

def process_context_instruction_pairs(cosine_sim_matrix, contexts, instructions, k_instructions=2, k_contexts=50):
    """Build context-instruction pairs from a similarity matrix.

    The matrix is indexed as ``[context, instruction]``. Three groups of pairs
    are returned, in this order:

    1. for every context, its ``k_instructions`` most similar instructions;
    2. for every instruction, its ``k_contexts`` most similar contexts;
    3. for every instruction, the single least similar context.

    Returns:
        list[dict]: One dict per pair with the keys ``context``,
        ``instruction``, ``similarity`` and ``relationship_type``.
    """
    def make_pair(context_idx, instruction_idx, relationship_type):
        return {
            "context": contexts[context_idx],
            "instruction": instructions[instruction_idx],
            "similarity": cosine_sim_matrix[context_idx, instruction_idx],
            "relationship_type": relationship_type,
        }

    # One row of instruction indices per context
    best_instructions = get_top_k_indices(cosine_sim_matrix, k=k_instructions, dim=1).indices.tolist()
    # Transposed so that there is one row of context indices per instruction
    best_contexts = get_top_k_indices(cosine_sim_matrix, k=k_contexts, dim=0).indices.T.tolist()
    # One context index per instruction
    worst_contexts = torch.argmin(torch.tensor(cosine_sim_matrix), dim=0).tolist()

    pairs = []
    pairs.extend(
        make_pair(context_idx, instruction_idx, "top instructions for context")
        for context_idx, instruction_indices in enumerate(best_instructions)
        for instruction_idx in instruction_indices
    )
    pairs.extend(
        make_pair(context_idx, instruction_idx, "top contexts for instruction")
        for instruction_idx, context_indices in enumerate(best_contexts)
        for context_idx in context_indices
    )
    pairs.extend(
        make_pair(context_idx, instruction_idx, "worst context for instruction")
        for instruction_idx, context_idx in enumerate(worst_contexts)
    )
    return pairs

In [None]:
# Process male and female datasets
# Each matrix is indexed as [context, instruction]
male_cosine_sim_matrix = cosine_similarity(male_context_embeddings, male_instruction_embeddings)
female_cosine_sim_matrix = cosine_similarity(female_context_embeddings, female_instruction_embeddings)

context_instruction_pairs_male = process_context_instruction_pairs(
    cosine_sim_matrix=male_cosine_sim_matrix,
    contexts=male_contexts,
    instructions=instructions_male,
    k_instructions=k_instructions,
    k_contexts=k_contexts
)

context_instruction_pairs_female = process_context_instruction_pairs(
    cosine_sim_matrix=female_cosine_sim_matrix,
    contexts=female_contexts,
    instructions=instructions_female,
    k_instructions=k_instructions,
    k_contexts=k_contexts
)

# Combine male and female pairs
context_instruction_pairs = context_instruction_pairs_male + context_instruction_pairs_female

# Add general instructions to context-instruction pairs
# Re-seed so the sampled contexts do not depend on earlier use of `random`
random.seed(seed)
# random.sample raises ValueError when asked for more items than are available
num_sampled_contexts = min(num_general_prompts, len(contexts))
for instruction in instructions_general:
    sampled_contexts = random.sample(contexts, num_sampled_contexts)
    for context in sampled_contexts:
        context_instruction_pairs.append({
            "context": context,
            "instruction": instruction,
            # General prompts are not matched by similarity; 0.0 is a placeholder
            "similarity": 0.0,
            "relationship_type": "general"
        })

if verbose:
    print(f"Cosine similarity matrix shape - male: {male_cosine_sim_matrix.shape}")
    print(f"Cosine similarity matrix shape - female: {female_cosine_sim_matrix.shape}")

In [None]:
# Convert context_instruction_pairs into a DataFrame
df_context_instruction_pairs = pd.DataFrame(context_instruction_pairs)

if verbose:
    print(f"SAMPLES\n{df_context_instruction_pairs.sample(3)}\n")
    print("INFO")
    # info() prints its report itself and returns None, so it is not wrapped in print()
    df_context_instruction_pairs.info()
    print(f"\nVALUE COUNTS\n{df_context_instruction_pairs['instruction'].value_counts()}\n")


In [None]:
# Remove rows that repeat the same 'context', 'instruction' and 'similarity'
# (a pair can be selected both as a top instruction and as a top context).
# The original index labels are kept, so the index has gaps afterwards.
df_context_instruction_pairs = df_context_instruction_pairs.drop_duplicates(subset=['context', 'instruction', 'similarity'])

if verbose:
    print(f"Num rows after dropping duplicates: {df_context_instruction_pairs.shape[0]}\n")
    print("Info")
    # info() prints its report itself and returns None, so it is not wrapped in print()
    df_context_instruction_pairs.info()


## LLM Response Generation
An LLM generates answers to the paired instructions based on the provided context. The generated responses are stored in the dataset.

In [None]:
template = """Lees onderstaande rapportages, die een periode van een client in het verpleeghuis beschrijven, en beantwoord onderstaande instructie.
Baseer je uitsluitend op de informatie die in de rapportages staat. Als er geen relevante informatie in staat, zeg dat dan. Hou je antwoord kort en bondig.

RAPPORTAGES:
{context}

INSTRUCTIE:
{instruction}
"""

prompt_template = PromptTemplate(
    input_variables=["context", "instruction"],
    template=template,
)

# Initialize the language model with specified parameters.
# The key comes from `open_ai_api_key`, which the environment setup cell loads;
# no `setup` object exists in this notebook.
llm = ChatOpenAI(
    api_key=open_ai_api_key,
    model="gpt-4o-mini-2024-07-18",
    temperature=0.3,
    presence_penalty=0.2,
)

chain = prompt_template | llm

if verbose:
    # Run the chain once on a single pair as a smoke test before the full run
    sample_id = 50
    sample_row = df_context_instruction_pairs.iloc[sample_id]
    sample_context = sample_row['context']
    sample_instruction = sample_row['instruction']

    chain_input = {"context": sample_context, "instruction": sample_instruction}
    sample_prompt = template.format(**chain_input)
    result = chain.invoke(chain_input)

    report_lines = (
        sample_prompt,
        "RELATIONSHIP TYPE",
        sample_row['relationship_type'],
        "RESPONSE",
        result.content,
    )
    for report_line in report_lines:
        print(report_line)


In [None]:
# Resume from the saved responses when they exist; otherwise start with an
# empty response column so every row counts as unprocessed
if not os.path.exists(fn_responses):
    df_context_instruction_pairs['llm_response'] = None
else:
    df_context_instruction_pairs = pd.read_pickle(fn_responses)

# Function to add LLM answer to the dataframe with error handling
def get_llm_answer(row, cb):
    """Return the LLM response for one context-instruction row.

    A row that already has a response keeps it, so an interrupted run can be
    resumed without repeating API calls. Any exception is reported and turned
    into ``None``, which leaves the row open for a later retry.

    Args:
        row: Row with the fields 'context', 'instruction' and 'llm_response'.
        cb: Callback handler passed to the chain invocation.
    """
    try:
        existing_answer = row['llm_response']
        if not pd.isna(existing_answer):
            return existing_answer
        chain_input = {"context": row['context'], "instruction": row['instruction']}
        return chain.invoke(chain_input, callbacks=[cb]).content
    except Exception as e:
        print(f"Error processing row {row.name}: {e}")
        return None

# Number of processed rows between two checkpoint saves
checkpoint_every = 100

# Create a callback instance to track cost
with get_openai_callback() as cb:

    # Iterate over the rows and save progress intermittently.
    # Rows are counted by position: the index labels have gaps after
    # drop_duplicates, so testing the label would checkpoint irregularly.
    for n_processed, (idx, row) in enumerate(df_context_instruction_pairs.iterrows(), start=1):
        df_context_instruction_pairs.at[idx, 'llm_response'] = get_llm_answer(row, cb)

        # Save progress every `checkpoint_every` rows
        if n_processed % checkpoint_every == 0:
            df_context_instruction_pairs.to_pickle(fn_responses)
            print(f"Checkpoint saved at index {idx}, total cost so far: ${cb.total_cost:.4f}")

    # Save the final result
    df_context_instruction_pairs.to_pickle(fn_responses)
    print("Processing complete and final dataframe saved.")
    print(f"Total cost: ${cb.total_cost:.4f}")

In [None]:
example_ct = 63
example_row = df_context_instruction_pairs.iloc[example_ct]
# No 'prompt' column is ever created in this notebook, so the prompt is rebuilt
# from the template and the row's context and instruction.
print(template.format(context=example_row['context'], instruction=example_row['instruction']))
print("\nRESPONSE:")
print(example_row['llm_response'])

## Dataset Creation, Splitting, and Saving
The context-instruction-response pairs are compiled into a single dataset, which is then split into training, validation, and test sets. Finally, the dataset is saved locally and prepared for uploading to Hugging Face, allowing it to be shared or reused in future projects.

In [None]:
df = (df_context_instruction_pairs
      .loc[:, ['context', 'instruction', 'llm_response']]
      # Failed LLM calls are stored as None; keep them out of the published dataset
      .dropna(subset=['llm_response'])
      .rename(columns={'llm_response': 'response'})
     )

# Convert df to Hugging Face dataset
dataset = Dataset.from_pandas(
    df=df,
    preserve_index=False
)

# Split the dataset into training (80%) and test/validation (20%) splits
train_testvalid_split = dataset.train_test_split(test_size=0.2, seed=seed)

# Further split the held-out part evenly into validation and test sets
test_valid_split = train_testvalid_split['test'].train_test_split(test_size=0.5, seed=seed)

# Create a DatasetDict object to hold the splits
dataset_dict = DatasetDict({
    'train': train_testvalid_split['train'],
    'validation': test_valid_split['train'],
    'test': test_valid_split['test'],
})

# # Push the dataset to HuggingFace Hub with the specified path and commit message
# dataset_dict.push_to_hub(path_hf_instruct,
#                          commit_message=commit_message,
#                          private=True)