# Fine-Tuning LLaMA-2 (7B) for Automated Chest X-Ray Impression Generation using LoRA (QLoRA)

1.Hugging Face Authentication

In [None]:
# Authenticate to access Hugging Face models
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

2.Installing Dependencies

In [None]:
# ✅ Install dependencies
!pip install -U transformers accelerate peft datasets sentencepiece
!pip install -U bitsandbytes

Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.2-py3-none-any.whl (512 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-4.4.2 pya

3.Environment Check

In [None]:
# Verify PyTorch version, GPU availability, and bitsandbytes installation
import torch, bitsandbytes as bnb
print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("bitsandbytes:", bnb.__version__)

PyTorch: 2.9.0+cu126
CUDA available: True
bitsandbytes: 0.49.1


4.Imports and Global Configuration

In [None]:
# Dataset and model utilities

from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForCausalLM,
                          DataCollatorForLanguageModeling, TrainingArguments, Trainer)
# PEFT utilities for LoRA / QLoRA
from peft import (prepare_model_for_kbit_training, LoraConfig, get_peft_model)

# Base model name and maximum sequence length
model_name = "meta-llama/Llama-2-7b-chat-hf"
# training lengths
MAX_LEN = 512

# where the files are (adjust this!)
data_dir = "/content/drive/MyDrive/datasets/radiology_data"

5.Mount Google Drive

In [None]:
#conncet to Gogle colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Confirm that dataset files are available
!ls "/content/drive/MyDrive/Datasets/radiology_data"

test.jsonl  train.jsonl  validate.jsonl


6.Load Radiology Dataset (JSONL)

In [None]:
#Load Data set
from datasets import load_dataset

data_dir = "/content/drive/MyDrive/Datasets/radiology_data"

# Define dataset splits
data_files = {
    "train":      f"{data_dir}/train.jsonl",
    "validation": f"{data_dir}/validate.jsonl",  # file is 'validate', key can be 'validation'
    "test":       f"{data_dir}/test.jsonl",
}
# Load dataset using Hugging Face Datasets
raw = load_dataset("json", data_files=data_files)
raw


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'inputs', 'target'],
        num_rows: 2735
    })
    validation: Dataset({
        features: ['idx', 'inputs', 'target'],
        num_rows: 341
    })
    test: Dataset({
        features: ['idx', 'inputs', 'target'],
        num_rows: 343
    })
})

7.Instruction Prompt Construction

In [None]:
# Build Prompt Format(Instruction + Input + Target)
# [INST] <findings> [/INST] <impression>

def build_prompt(inp, tgt):
    return f"[INST] {inp.strip()} [/INST] {tgt.strip()}"



8.Apply Prompt Formatting

In [None]:
# Convert each dataset example into a single instruction-formatted text field
def to_text(example):
    example["text"] = build_prompt(example["inputs"], example["target"])
    return example

train_ds = raw["train"].map(to_text)
val_ds   = raw["validation"].map(to_text)
test_ds  = raw["test"].map(to_text)

print(train_ds[0])

Map:   0%|          | 0/2735 [00:00<?, ? examples/s]

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/343 [00:00<?, ? examples/s]

{'idx': 0, 'inputs': 'The lungs are clear, and without focal air space opacity. The cardiomediastinal silhouette is normal in size and contour, and stable. There is no pneumothorax or large pleural effusion.', 'target': 'No acute cardiopulmonary abnormality.', 'text': '[INST] The lungs are clear, and without focal air space opacity. The cardiomediastinal silhouette is normal in size and contour, and stable. There is no pneumothorax or large pleural effusion. [/INST] No acute cardiopulmonary abnormality.'}


9.Tokenization

In [None]:
# Load tokenizer for LLaMA-2
from transformers import AutoTokenizer

# LLaMA does not define a padding token; reuse EOS token for batching
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Tokenize text with truncation to control sequence length
def tokenize(example):
    return tokenizer(example["text"], truncation=True, max_length=512)

train_token = train_ds.map(tokenize, batched=True)
val_token   = val_ds.map(tokenize, batched=True)

Map:   0%|          | 0/2735 [00:00<?, ? examples/s]

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

10.LoRA Configuration

In [None]:
# Configure LoRA adapters for parameter-efficient fine-tuning
from peft import LoraConfig, get_peft_model

lora = LoraConfig(
    r=16,# Rank of LoRA matrices
    lora_alpha=32,# Scaling factor
    lora_dropout=0.1,# Regularization
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj",
                    "gate_proj","up_proj","down_proj"]
)
# Attach LoRA adapters to the base model
model = get_peft_model(model, lora)
# Print number of trainable vs frozen parameters
model.print_trainable_parameters()



trainable params: 39,976,960 || all params: 6,778,392,576 || trainable%: 0.5898


11.Training Configuration

In [None]:

  # Training arguments optimized for limited GPU memory
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./radiology_llama_lora",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # Simulate larger batch size
    logging_steps=50,
    save_strategy="epoch",
    learning_rate=2e-4,
    fp16=True,                      # Mixed precision for speed and memory efficiency
    report_to="none",
)
# Causal language modeling data collator

collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
# Model Training
# # Initialize Trainer and start fine-tuning LoRA adapters
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_token,
    eval_dataset=val_token,
    data_collator=collator,
    tokenizer=tokenizer,
)
trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
50,1.3992
100,1.0046
150,0.909
200,0.8997
250,0.8325
300,0.8478
350,0.7879
400,0.6823
450,0.6691
500,0.6248


  return fn(*args, **kwargs)


Step,Training Loss
50,1.3992
100,1.0046
150,0.909
200,0.8997
250,0.8325
300,0.8478
350,0.7879
400,0.6823
450,0.6691
500,0.6248


TrainOutput(global_step=684, training_loss=0.7997248465554756, metrics={'train_runtime': 4167.7715, 'train_samples_per_second': 1.312, 'train_steps_per_second': 0.164, 'total_flos': 1.8518664035549184e+16, 'train_loss': 0.7997248465554756, 'epoch': 2.0})

12.Save LoRA Adapters

In [None]:
#Save LoRA adapter (and tokenizer)
save_dir = "/content/drive/MyDrive/radiology_llama_lora_final"

model.save_pretrained(save_dir, safe_serialization=True)
tokenizer.save_pretrained(save_dir)

print("Saved to:", save_dir)


Saved to: /content/drive/MyDrive/radiology_llama_lora_final


13.Merge LoRA Adapters into Base Model

In [None]:
#Create a single “merged” model for easy inference anywhere
# Merge LoRA into the base weights and save a full model
merged_dir = "/content/drive/MyDrive/radiology_llama_merged"

merged = model.merge_and_unload()            # turns your PEFT model into a plain HF model
merged.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)

print("Merged model saved to:", merged_dir)



Merged model saved to: /content/drive/MyDrive/radiology_llama_merged


14.Inference on Fine-Tuned Model

In [None]:
# ===============================
# Inference on Fine-Tuned Model
# ===============================

# Load tokenizer and merged (LoRA + base) model for inference
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Path to the merged model (base model + LoRA adapters)
model_path = "/content/drive/MyDrive/radiology_llama_merged"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load model in half precision for faster and memory-efficient inference
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Set model to evaluation mode
model.eval()

# --------------------------------
# Inference helper function
# --------------------------------
def generate_impression(text, max_new_tokens=150, temperature=0.7):
    """
    Generates a radiology impression from input findings
    using the instruction-tuned LLaMA-2 model.
    """
# Format input using LLaMA-2 instruction style
    prompt = f"[INST] {text.strip()} [/INST]"

    # Tokenize input and move tensors to the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate model output
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,          # Enable sampling for natural language generation
        temperature=temperature # Controls creativity vs determinism
    )

    # Decode generated tokens into readable text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example inference on test input
print(
    generate_impression(
        "There is patchy consolidation in the right middle zone. "
        "Provide a radiology impression."
    )
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Loading adapter weights from /content/drive/MyDrive/radiology_llama_merged led to missing keys in the model: model.layers.0.self_attn.q_proj.lora_A.default.weight, model.layers.0.self_attn.q_proj.lora_B.default.weight, model.layers.0.self_attn.k_proj.lora_A.default.weight, model.layers.0.self_attn.k_proj.lora_B.default.weight, model.layers.0.self_attn.v_proj.lora_A.default.weight, model.layers.0.self_attn.v_proj.lora_B.default.weight, model.layers.0.self_attn.o_proj.lora_A.default.weight, model.layers.0.self_attn.o_proj.lora_B.default.weight, model.layers.0.mlp.gate_proj.lora_A.default.weight, model.layers.0.mlp.gate_proj.lora_B.default.weight, model.layers.0.mlp.up_proj.lora_A.default.weight, model.layers.0.mlp.up_proj.lora_B.default.weight, model.layers.0.mlp.down_proj.lora_A.default.weight, model.layers.0.mlp.down_proj.lora_B.default.weight, model.layers.1.self_attn.q_proj.lora_A.default.weight, model.layers.1.self_attn.q_proj.lora_B.default.weight, model.layers.1.self_attn.k_proj.l

[INST] There is patchy consolidation in the right middle zone. Provide a radiology impression. [/INST]  Based on the information provided, here is a possible radiology impression for the patchy consolidation in the right middle zone:

Impression:

* Patchy consolidation in the right middle zone, likely representing an infectious process such as pneumonia.
* The patchy consolidation may represent areas of inflammation and/or granuloma formation in the lung tissue.
* The location in the right middle zone is consistent with a possible bacterial or viral infection that has spread to the lung.
* Further imaging and laboratory studies are necessary to confirm the diagnosis and determine the appropriate course of treatment.

It is important to note that this


In [None]:
#Install evaluation libraries
!pip -q install rouge-score bert-score nltk
import nltk
nltk.download("punkt", quiet=True)

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


True

15.Load and sanity-check the test split

In [None]:
# ===============================
# Load and sanity-check the test split
# ===============================

# Load the held-out test set from JSON to evaluate generalization
# (test data is never used during training or validation)
from datasets import load_dataset

DATA_DIR = "/content/drive/MyDrive/Datasets/radiology_data"

# Load only the test split for final evaluation
test_ds = load_dataset(
    "json",
    data_files={"test": f"{DATA_DIR}/test.jsonl"}  # (or test.json depending on your file)
)["test"]

# Basic sanity checks: dataset size + inspect one sample schema/content
print("Test samples:", len(test_ds))
test_ds[0]

Generating test split: 0 examples [00:00, ? examples/s]

Test samples: 343


{'idx': 3076,
 'inputs': 'Heart size within normal limits. No focal alveolar consolidation, no definite pleural effusion seen. No typical findings of pulmonary edema. Mediastinal calcification and dense right upper lung nodule suggest a previous granulomatous process.',
 'target': 'No acute cardiopulmonary findings'}

16.Load Merged Model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Path to the merged model checkpoint
MODEL_PATH = "/content/drive/MyDrive/radiology_llama_merged"

# Use GPU for inference if available
device = "cuda"

# Load tokenizer from the merged model directory
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load model in half precision for faster and memory-efficient inference
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16
).to(device)

# Set model to evaluation mode to disable dropout
model.eval()

# LLaMA models do not define a padding token by default
# Reuse EOS token to enable proper batching
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading adapter weights from /content/drive/MyDrive/radiology_llama_merged led to missing keys in the model: model.layers.0.self_attn.q_proj.lora_A.default.weight, model.layers.0.self_attn.q_proj.lora_B.default.weight, model.layers.0.self_attn.k_proj.lora_A.default.weight, model.layers.0.self_attn.k_proj.lora_B.default.weight, model.layers.0.self_attn.v_proj.lora_A.default.weight, model.layers.0.self_attn.v_proj.lora_B.default.weight, model.layers.0.self_attn.o_proj.lora_A.default.weight, model.layers.0.self_attn.o_proj.lora_B.default.weight, model.layers.0.mlp.gate_proj.lora_A.default.weight, model.layers.0.mlp.gate_proj.lora_B.default.weight, model.layers.0.mlp.up_proj.lora_A.default.weight, model.layers.0.mlp.up_proj.lora_B.default.weight, model.layers.0.mlp.down_proj.lora_A.default.weight, model.layers.0.mlp.down_proj.lora_B.default.weight, model.layers.1.self_attn.q_proj.lora_A.default.weight, model.layers.1.self_attn.q_proj.lora_B.default.weight, model.layers.1.self_attn.k_proj.l

17.Prompt Formatting Helper

In [None]:
# Formats input text using LLaMA-2 instruction style
# This ensures consistency between training and inference
def format_prompt(x):
    return f"[INST] {x.strip()} [/INST]"

18.Batch Inference on Test Set

In [None]:


# Batch inference over the FULL test set with CLEANED model outputs

from tqdm.auto import tqdm
import torch
import re

# Lists to store model predictions and ground-truth references
preds, refs = [], []

def clean_text(text):
    """
    Post-process generated text by removing leading section headers
    such as 'Impression:' to better match reference formatting.
    """
    text = re.sub(r"^\s*(impression\s*:?)\s*", "", text, flags=re.I)
    return text.strip()

@torch.no_grad()
def generate_clean(text):
    """
    Generate model output using controlled decoding settings
    and apply text cleaning for evaluation.
    """
    enc = tokenizer(text, return_tensors="pt").to(device)

    out = model.generate(
        **enc,
        max_new_tokens=120,          # Limit output length
        do_sample=True,              # Enable sampling for more natural text
        temperature=0.7,             # Control randomness
        top_p=0.9,                   # Nucleus sampling
        repetition_penalty=1.15,     # Reduce repetitive phrases
        no_repeat_ngram_size=3,       # Prevent repeated n-grams
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    return clean_text(decoded)

# Run inference on the FULL test dataset
for ex in tqdm(test_ds):
    prompt = format_prompt(ex["inputs"])
    pred = generate_clean(prompt)

    preds.append(pred)
    refs.append(ex["target"])

# Final sanity checks and sample outputs
print("Done.")
print("Total samples:", len(preds))
print("Sample PRED:", preds[0])
print("Sample REF :", refs[0])



  0%|          | 0/343 [00:00<?, ?it/s]

Done.
Total samples: 343
Sample PRED: [INST] Heart size within normal limits. No focal alveolar consolidation, no definite pleural effusion seen. No typical findings of pulmonary edema. Mediastinal calcification and dense right upper lung nodule suggest a previous granulomatous process. [/INST]  Based on the imaging findings you provided, here is a possible diagnosis for the patient:

Diagnosis: Previous granuloma (possibly tuberculosis) with residual mediastinal calcifications and a right upper lobe nodule.

Rationale:
The absence of any focal areas of alveolitis or pleural fluid suggests that the patient's respiratory symptoms are not due to an acute exacerbation of chronic bronchitis or pneumonia. The presence of mediastrial calcifications
Sample REF : No acute cardiopulmonary findings


In [None]:
# Compute ROUGE-L
from rouge_score import rouge_scorer
import numpy as np

rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

rougeL = np.mean([
    rouge.score(r, p)["rougeL"].fmeasure
    for r, p in zip(refs, preds)
])

print("ROUGE-L F1:", round(rougeL, 4))


ROUGE-L F1: 0.0591


In [None]:
#Compute BLEU-4
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smooth = SmoothingFunction().method1
bleu_scores = []

for r, p in zip(refs, preds):
    bleu_scores.append(sentence_bleu([r.split()], p.split(), smoothing_function=smooth))

print("BLEU-4:", round(float(np.mean(bleu_scores)), 4))

BLEU-4: 0.007


In [None]:
#Compute Bertscore
from bert_score import score as bert_score

P, R, F1 = bert_score(
    preds,
    refs,
    lang="en",
    model_type="microsoft/deberta-xlarge-mnli",
    rescale_with_baseline=False
)

print("BERTScore Precision:", round(float(P.mean()), 4))
print("BERTScore Recall:   ", round(float(R.mean()), 4))
print("BERTScore F1:       ", round(float(F1.mean()), 4))


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

BERTScore Precision: 0.4409
BERTScore Recall:    0.6318
BERTScore F1:        0.518


This notebook demonstrates an end-to-end pipeline for instruction-tuned radiology text generation using LLaMA-2 and QLoRA, optimized for limited GPU resources and real-world clinical NLP tasks.