<a href="https://colab.research.google.com/github/ekrombouts/GenCareAI/blob/work_in_progress/scripts/work_in_progress/421_CarePlan_Psych_TrainFietjeBase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
# Fine-tuning a Dutch Causal Language Model for Nursing Home Note Summarization

In this notebook, we fine-tune the Dutch Causal Language Model "BramVanroy/fietje-2" on a dataset of general nursing home notes
to automatically summarize key points regarding cognitive and behavioral issues from the notes.
The aim is to create a model that can read Dutch nursing home reports and generate summaries of a client's mental and behavioral state.

### Key Steps:
1. Load and preprocess the SAMPC dataset.
2. Truncate care notes to fit within the model's input limits.
3. Fine-tune the "fietje-2" model using PyTorch and Hugging Face's Trainer.
4. Generate outputs and compare with actual psychological assessments.
5. Push the fine-tuned model and tokenizer to the Hugging Face Hub.

### Dataset:
We use the "ekrombouts/Galaxy_SAMPC" dataset, which contains synthetic nursing care notes along with descriptions or summaries for the SAMPC categories:
somatic, ADL (Activities of Daily Living), social, psychological, and communication aspects.

### Expected Outcome:
After fine-tuning, the model will generate summaries of cognitive and behavioral aspects from nursing home notes.
While the output may not be fully reliable, this project serves to illustrate the potential of using language models for summarizing care data.
"""

In [None]:
# Install necessary libraries
!pip install -q transformers datasets


In [None]:
# Import required libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, DatasetDict
import time


In [None]:
# Configuration: dataset repo, base checkpoint, target repo name and commit text
path_hf_sampc = "ekrombouts/Galaxy_SAMPC"
model_name = "BramVanroy/fietje-2"
model_finetuned = "fietje_zorgplan_psyche"
commit_message = "Trained base model"

# Tokenizer first; the checkpoint's EOS token doubles as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Base model in bfloat16, with layers placed automatically on the available devices
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    torch_dtype=torch.bfloat16,
)

# Report how much memory the loaded weights occupy
print(f"Memory footprint: {model.get_memory_footprint() / 1e9} GB")


In [None]:
# Fetch the dataset from the Hub and pick out the two splits used below
dataset = load_dataset(path_hf_sampc)
train_dataset, val_dataset = dataset['train'], dataset['validation']


In [None]:
# Define truncation function
def truncate_notes_to_fit_prompt(notes, max_length=1800):
    """Shorten `notes` to at most `max_length` tokens and return it as text.

    The text is tokenized with truncation enabled and the surviving token ids
    are decoded back to a string, with special tokens left out.

    Args:
        notes: The care notes to shorten.
        max_length: Maximum number of tokens to keep.

    Returns:
        The decoded, possibly shortened, notes.
    """
    encoding = tokenizer(
        notes,
        truncation=True,
        max_length=max_length,
        return_tensors="np",
    )
    # return_tensors adds a batch axis; the single sequence sits at index 0
    first_sequence_ids = encoding["input_ids"][0]
    return tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

# Add truncated notes to dataset
def add_truncated_notes(example):
    """Build the `truncated_notes` column for one dataset row.

    Args:
        example: A single dataset row; its "notes" field is read.

    Returns:
        A dict with the key "truncated_notes" holding the notes cut down to
        1800 tokens.
    """
    return {
        "truncated_notes": truncate_notes_to_fit_prompt(example["notes"], max_length=1800)
    }

# Add the truncated_notes column to both splits (train first, then validation)
train_dataset, val_dataset = [
    split.map(add_truncated_notes) for split in (train_dataset, val_dataset)
]

In [None]:
# Sample generation with prompts and evaluation mode
# Runs the model on one validation example and prints its answer next to the
# reference text from the dataset.
sample = val_dataset[1]
notes = sample['truncated_notes']
gedrag_actual = sample['psychisch']

prompt = f'''Lees de volgende rapportages en beschrijf de cognitie en gedragsproblemen van de cliënt.

Rapportages:
{notes}

Geef de output als lijst van strings, voorbeeld: ["aap", "noot", "mies"]

Beschrijf cognitie/gedragsproblemen:
'''

# Tokenize the prompt once and reuse both tensors (ids and attention mask)
encoded_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = encoded_prompt.input_ids.to(model.device)
attention_mask = encoded_prompt.attention_mask.to(model.device)

# Enable cache and set model to evaluation mode
model.config.use_cache = True
model.eval()

# Generate output
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=150,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    temperature=0.7,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)

# Decode generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# The returned sequence starts with the prompt tokens. Cut at the prompt's token
# length rather than its character length: decoding is not guaranteed to
# reproduce the prompt string character for character, so a character offset
# can clip or leak text.
prompt_token_count = input_ids.shape[1]
generated_response = tokenizer.decode(
    output[0][prompt_token_count:], skip_special_tokens=True
).strip()

# Display generated response and actual response
print("Generated response:")
print(generated_response)
print("\nActual:")
print(gedrag_actual)

In [None]:
# Function to tokenize dataset samples
def collate_and_tokenize(examples):
    """Turn a batch of dataset rows into padded training inputs with labels.

    For every row a training text is built from the instruction, the truncated
    notes and the reference answer, followed by the tokenizer's EOS token so
    the model sees an explicit end-of-answer signal. The texts are padded or
    truncated to 2048 tokens.

    Args:
        examples: A batch as passed by `Dataset.map(..., batched=True)`; the
            columns "truncated_notes" and "psychisch" are read. Every row in
            the batch is processed, so any batch size works.

    Returns:
        The tokenizer output (PyTorch tensors) with an extra "labels" entry.
        Labels equal the input ids, except that padding positions are set to
        -100 so they are ignored by the loss.
    """
    eos_token = tokenizer.eos_token or ""

    prompts = []
    for notes, psychisch in zip(examples["truncated_notes"], examples["psychisch"]):
        prompt = f'''Lees de volgende rapportages en beschrijf de cognitie en gedragsproblemen van de cliënt.

Rapportages:
{notes}

Geef de output als lijst van strings, voorbeeld: ["aap", "noot", "mies"]

Beschrijf cognitie/gedragsproblemen:
{psychisch}
'''
        # Explicit EOS: once padding is masked out of the labels, this is the
        # only place the model learns where an answer ends.
        prompts.append(prompt + eos_token)

    # Tokenize and create labels
    encoded = tokenizer(
        prompts,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=2048,
    )

    labels = encoded["input_ids"].clone()
    # The pad token is the EOS token, so mask by attention mask rather than by
    # token id: that keeps the real EOS and drops only the padding.
    labels[encoded["attention_mask"] == 0] = -100
    encoded["labels"] = labels
    return encoded

In [None]:
# Original columns that are dropped once the tokenized fields exist
columns_to_remove = [
    'ct_id',
    'week',
    'notes',
    'somatiek',
    'adl',
    'mobiliteit',
    'continentie',
    'maatschappelijk',
    'psychisch',
    'truncated_notes',
]

# Both splits are tokenized one row per call, with identical settings
map_options = dict(batched=True, batch_size=1, remove_columns=columns_to_remove)
tokenized_dataset_train = train_dataset.map(collate_and_tokenize, **map_options)
tokenized_dataset_val = val_dataset.map(collate_and_tokenize, **map_options)


In [None]:
# Print trainable parameters in the model
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Args:
        model: Any object exposing `named_parameters()` that yields
            `(name, parameter)` pairs, where each parameter provides `numel()`
            and a `requires_grad` flag.

    Prints one line with the trainable count, the total count and the
    trainable share as a percentage. A model without parameters is reported
    as 0.00% instead of raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the division: an empty model has no meaningful ratio
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}%"
    )

# Report the trainable and total parameter counts of the loaded model
print_trainable_parameters(model)


In [None]:
# Enable gradient checkpointing and set model to training mode
# Checkpointing is switched on directly on the model here; the same option is
# also passed to TrainingArguments (gradient_checkpointing=True).
model.gradient_checkpointing_enable()
# Leave the evaluation mode that was set for the sample generation.
model.train()


In [None]:
# Define training arguments
import inspect

# `evaluation_strategy` has been deprecated in favour of `eval_strategy` in
# recent transformers releases, and the notebook installs an unpinned version.
# Use whichever keyword the installed TrainingArguments accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results_full',
    report_to='none',
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # 1 sample per step x 8 accumulation steps = 8 samples per optimizer update
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    warmup_steps=50,
    logging_dir='./logs',
    logging_strategy="steps",
    logging_steps=50,
    # Saving and evaluating run on the same schedule (every 100 steps), which
    # load_best_model_at_end needs in order to pick a checkpoint.
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    **{_eval_strategy_kwarg: "steps"},
    eval_steps=100,
    load_best_model_at_end=True,
    bf16=True,
    learning_rate=5e-5,
    weight_decay=0.01,
)


In [None]:
# Build the Trainer from the model, the training arguments and both tokenized splits
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_dataset_val,
    train_dataset=tokenized_dataset_train,
)

# The cache was switched on for the generation sample; turn it off again for training
model.config.use_cache = False

In [None]:
# Train the model and measure training time
# perf_counter is a monotonic clock meant for measuring durations; unlike
# time.time() it is not affected by system clock adjustments during the run.
start_time = time.perf_counter()  # Start of the measurement
trainer.train()  # Start training
end_time = time.perf_counter()  # End of the measurement

training_time = end_time - start_time  # Total training time in seconds
print(f"Training completed in {training_time} seconds.")

In [None]:
# Push trained model and tokenizer to Hugging Face Hub
# `token=True` uses the locally stored Hugging Face login token; it replaces the
# deprecated `use_auth_token` argument.
model.push_to_hub(model_finetuned, token=True, commit_message=commit_message, private=True)
# private=True is repeated here so the repository is also created as private
# when this call runs first or on its own.
tokenizer.push_to_hub(model_finetuned, token=True, commit_message=commit_message, private=True)

In [None]:
# Stop Colab runtime (if applicable)
try:
    from google.colab import runtime
except ImportError:
    # Not running on Google Colab: there is no runtime to release, so do nothing
    pass
else:
    runtime.unassign()