# 🧠 MedChatGuard - PeFT Fine-Tuning in Colab
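Fine-tune an instruction-following seq2seq model (`google/flan-t5-small`) on synthetic EHR data. Note: the cells below run full fine-tuning with the Hugging Face `Trainer`; QLoRA/PEFT adapters are not applied in this notebook.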


### Install Dependencies

In [None]:
!pip install transformers datasets evaluate accelerate

### Load instruction-style dataset (`instruction` / `input` / `output`) from Drive

In [None]:
# Mount Google Drive when running on Colab. Outside Colab the `google.colab`
# package is not installed, so skip the mount instead of raising and aborting
# a Restart & Run All.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')
    DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/FineTuning/ehr_clean_text.jsonl"

In [4]:
# Local (non-Colab) runs: read the dataset from the repository's data folder.
# Guarded so that a top-to-bottom run on Colab keeps the Drive path assigned
# in the previous cell instead of silently overwriting it.
import sys

if "google.colab" not in sys.modules:
    DATA_PATH = "../data/finetune/ehr_clean_text.jsonl"

### Load Dataset, Tokenizer, and Model

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Base checkpoint that gets fine-tuned.
MODEL_NAME = "google/flan-t5-small"

# Read the JSON-lines file at DATA_PATH as a single "train" split.
dataset = load_dataset(path="json", data_files=DATA_PATH, split="train")

# Tokenizer and seq2seq model weights, both taken from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

Generating train split: 578 examples [00:00, 8075.48 examples/s]
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


### Preprocessing Function

In [7]:
# Display the dataset summary (column names and number of rows).
dataset

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 578
})

In [8]:
def preprocess_instruction(example):
    """Tokenize one record into encoder inputs and loss-ready labels.

    The prompt is the record's ``instruction`` and ``input`` fields joined by a
    blank line; the target is its ``output`` field. Prompt and target are both
    padded/truncated to 512 tokens.

    Args:
        example: A single (non-batched) record with ``instruction``, ``input``
            and ``output`` string fields.

    Returns:
        The tokenizer's encoding with ``input_ids``, ``attention_mask`` and
        ``labels``, where every padding position in ``labels`` is set to -100.
    """
    # Combine instruction + input
    input_text = f"{example['instruction']}\n\n{example['input']}"
    target_text = example["output"]

    encoded = tokenizer(
        input_text,
        text_target=target_text,
        padding="max_length",
        max_length=512,
        truncation=True
    )

    # The labels were padded with the pad token id. Left as-is, the loss would
    # also be computed on those padding positions (the default data collator
    # does not mask them). -100 is the label id the loss function ignores.
    pad_id = tokenizer.pad_token_id
    encoded["labels"] = [
        token_id if token_id != pad_id else -100
        for token_id in encoded["labels"]
    ]
    return encoded

# Tokenize the records one at a time (map is non-batched by default), then
# have the dataset hand back PyTorch tensors.
tokenized_dataset = dataset.map(preprocess_instruction)
tokenized_dataset.set_format(type="torch")


Map: 100%|██████████| 578/578 [00:00<00:00, 789.63 examples/s] 


### Configure Training Arguments and Trainer

In [10]:
import inspect

from transformers import TrainingArguments, Trainer, default_data_collator

# Hyperparameters for the fine-tuning run. No evaluation dataset is passed to
# the Trainer, so only training loss is logged; a checkpoint is saved at the
# end of every epoch.
training_args = TrainingArguments(
    output_dir="./flan-small-checkpoints",
    save_strategy="epoch",
    num_train_epochs=3,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"  # do not report to external experiment trackers
)

# Newer transformers releases deprecate Trainer's `tokenizer=` argument in
# favour of `processing_class=`. Because the install cell does not pin a
# version, pick whichever keyword the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=default_data_collator,
    **{_tokenizer_kwarg: tokenizer},
)


  trainer = Trainer(


### Train

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


### Save Model

In [None]:
# Destination folder for the fine-tuned weights and tokenizer files.
# (On Colab, substitute a folder under the mounted Drive instead.)
SAVE_PATH = "../models/finetuned_model/flan-t5-small-finetuned"

# Write the model first, then the tokenizer alongside it so the folder can be
# reloaded with from_pretrained; the tokenizer call's result is the cell output.
trainer.save_model(output_dir=SAVE_PATH)
tokenizer.save_pretrained(save_directory=SAVE_PATH)

('../models/finetuned_model/roberta-base-squad2\\tokenizer_config.json',
 '../models/finetuned_model/roberta-base-squad2\\special_tokens_map.json',
 '../models/finetuned_model/roberta-base-squad2\\vocab.json',
 '../models/finetuned_model/roberta-base-squad2\\merges.txt',
 '../models/finetuned_model/roberta-base-squad2\\added_tokens.json',
 '../models/finetuned_model/roberta-base-squad2\\tokenizer.json')