In [1]:
import pandas as pd
from itertools import islice
import torch
from torch.utils.data import DataLoader
import sys
sys.path.append(r"C:\Users\emman\Desktop\PROYECTOS_VS_CODE\PRUEBAS_DE_PYTHON\Chest-X-ray-Diagnosis-Automated-Reporting-using-CNNs-and-LLMs---UDEM-PEF-Thesis-Fall-2025")

from utils.text_metrics import evaluate_all_metrics
from utils.temp_utils import *
from utils.lstm_models import DinoLSTMAttnCaptioner, DinoBiLSTMAttnCaptioner
from utils.chexpert_dataset import CheXpertDataset
from utils.padchest_dataset import PadChestGRDataset

# Data

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

df = pd.read_csv(r"C:\Users\emman\Desktop\PROYECTOS_VS_CODE\PRUEBAS_DE_PYTHON\PadChest-GR\master_table.csv")   # must contain column 'ImageID'
root_dir = r'C:\Users\emman\Desktop\PROYECTOS_VS_CODE\PRUEBAS_DE_PYTHON\PadChest-GR\PadChest_GR'
json_file = r"C:\Users\emman\Desktop\PROYECTOS_VS_CODE\PRUEBAS_DE_PYTHON\PadChest-GR\grounded_reports_20240819.json"

df_train = df[df['split'] == 'train']
df_validation = df[df['split'] == 'validation']
df_test = df[df['split'] == 'test']


IMG_SIZE = 224
MAX_LEN = 64
NUM_BATCH = 8

tf = dino_image_transform(img_size=IMG_SIZE)

ds_train = PadChestGRDataset(
        dataframe=df_train,
        root_dir=root_dir,
        json_file=json_file,
        max_txt_len=MAX_LEN,
        image_size=IMG_SIZE,
        normalize=True,
        transform=None,
        return_paths=False,
        sentence_key="sentence_en",
    )

ds_valid = PadChestGRDataset(
        dataframe=df_validation,
        root_dir=root_dir,
        json_file=json_file,
        max_txt_len=MAX_LEN,
        image_size=IMG_SIZE,
        normalize=True,
        transform=None,
        return_paths=False,
        sentence_key="sentence_en",
    )

ds_test = PadChestGRDataset(
        dataframe=df_test,
        root_dir=root_dir,
        json_file=json_file,
        max_txt_len=MAX_LEN,
        image_size=IMG_SIZE,
        normalize=True,
        transform=None,
        return_paths=False,
        sentence_key="sentence_en",
    )

tokenizer = build_tokenizer_from_labels()
pad_id = tokenizer.pad_token_id
eos_id = tokenizer.eos_token_id
bos_id = tokenizer.bos_token_id
collate_fn = CaptionCollate(tokenizer, pad_id)

train_loader = DataLoader(ds_train, batch_size=NUM_BATCH, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(ds_valid, batch_size=NUM_BATCH, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(ds_test, batch_size=NUM_BATCH, shuffle=False, collate_fn=collate_fn)

Using device: cuda


# Model

In [3]:
# DINO ViT-S/16 hidden size is 384 
EMBEDDING_D_IMG = 384
N_PREFIX = (IMG_SIZE // 16) ** 2  # number of visual prefix tokens (including CLS)

model = DinoBiLSTMAttnCaptioner(
    vocab_size=tokenizer.vocab_size,
    d_img=EMBEDDING_D_IMG,
    d_h=512,
    pad_id=pad_id,
    dino_model_id="facebook/dinov3-vits16-pretrain-lvd1689m",
    freeze_dino=True,
).to(device)

# Train Parameters

In [4]:
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()), lr=3e-4, weight_decay=1e-2
)
loss = sequence_ce_loss
NUM_EPOCHS = 10
BATCHES_PER_EPOCH = 10

# Training

In [5]:
for epoch in range(NUM_EPOCHS):
    slice_train_loader = islice(train_loader, BATCHES_PER_EPOCH)
    slice_valid_loader = islice(valid_loader, BATCHES_PER_EPOCH)
    train_stats = train_one_epoch(model, slice_train_loader, optimizer, device, pad_id, num_batches=BATCHES_PER_EPOCH, loss_fn=loss, grad_clip=1.0)
    val_stats = evaluate(model, slice_valid_loader, device, pad_id, num_batches=BATCHES_PER_EPOCH, loss_fn=loss)
    print(f"Epoch {epoch + 1}: Train Loss={train_stats['loss']:.4f}, PPL={train_stats['ppl']:.2f} | "
            f"Val Loss={val_stats['val_loss']:.4f}, Val PPL={val_stats['val_ppl']:.2f}")

Training: 100%|██████████| 10/10 [00:11<00:00,  1.15s/it]
Evaluating: 100%|██████████| 10/10 [00:08<00:00,  1.22it/s]


Epoch 1: Train Loss=9.2569, PPL=19195.13 | Val Loss=7.1982, Val PPL=1392.66


Training: 100%|██████████| 10/10 [00:08<00:00,  1.12it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.56it/s]


Epoch 2: Train Loss=6.3692, PPL=720.12 | Val Loss=5.1463, Val PPL=190.55


Training: 100%|██████████| 10/10 [00:10<00:00,  1.07s/it]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.56it/s]


Epoch 3: Train Loss=4.5003, PPL=96.32 | Val Loss=4.1610, Val PPL=70.86


Training: 100%|██████████| 10/10 [00:09<00:00,  1.07it/s]
Evaluating: 100%|██████████| 10/10 [00:07<00:00,  1.43it/s]


Epoch 4: Train Loss=3.9423, PPL=54.31 | Val Loss=3.5502, Val PPL=40.91


Training: 100%|██████████| 10/10 [00:09<00:00,  1.09it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.49it/s]


Epoch 5: Train Loss=3.1957, PPL=26.76 | Val Loss=3.2326, Val PPL=29.99


Training: 100%|██████████| 10/10 [00:09<00:00,  1.06it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.46it/s]


Epoch 6: Train Loss=3.0092, PPL=22.35 | Val Loss=2.9430, Val PPL=19.99


Training: 100%|██████████| 10/10 [00:11<00:00,  1.19s/it]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.56it/s]


Epoch 7: Train Loss=2.8387, PPL=18.06 | Val Loss=2.8717, Val PPL=19.68


Training: 100%|██████████| 10/10 [00:10<00:00,  1.03s/it]
Evaluating: 100%|██████████| 10/10 [00:07<00:00,  1.41it/s]


Epoch 8: Train Loss=2.4678, PPL=12.10 | Val Loss=2.6246, Val PPL=14.20


Training: 100%|██████████| 10/10 [00:11<00:00,  1.12s/it]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.49it/s]


Epoch 9: Train Loss=2.3389, PPL=10.83 | Val Loss=2.2642, Val PPL=10.20


Training: 100%|██████████| 10/10 [00:09<00:00,  1.08it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.51it/s]

Epoch 10: Train Loss=2.3553, PPL=11.04 | Val Loss=2.4906, Val PPL=12.77





# Test Parameters

In [6]:
BATCHES_PER_TEST = 1
GREEDY_DECODE = True
TEST_MAX_LEN = 256
TEST_TOP_P = 0.9
TEST_TEMPERATURE = 0.9

# Test

In [7]:
slice_test_loader = islice(test_loader, BATCHES_PER_TEST)
test_stats = evaluate(model, slice_test_loader, device, pad_id, num_batches=BATCHES_PER_TEST)
print(f"Test Loss={test_stats['val_loss']:.4f}, Test PPL={test_stats['val_ppl']:.2f}")

Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.62it/s]

Test Loss=2.0509, Test PPL=7.77





# Test Report Generation

In [8]:
with torch.no_grad():
    for pixel_values, ids_loader, paths, raw_labels in test_loader:
        pixel_values = pixel_values.to(device)
        gen_ids = model.generate(
            pixel_values=pixel_values,
            bos_id=bos_id, eos_id=eos_id,
            max_new_tokens=TEST_MAX_LEN, top_p=TEST_TOP_P, temperature=TEST_TEMPERATURE, greedy=GREEDY_DECODE
        )
        print("Predictions (first batch):")
        for i in range(gen_ids.size(0)):
            text_gen = tokenizer.decode(gen_ids[i].tolist())
            text_tgt = tokenizer.decode(ids_loader[i].tolist())
            print(f"\nGEN {i+1}:", text_gen)
            print(f"TGT {i+1}:", text_tgt)
            results = evaluate_all_metrics([text_tgt], [text_gen], evaluation_mode="CheXagent")
            for metric, scores in results.items():
                print(f"{metric}: {scores}")
        del pixel_values, ids_loader, paths, raw_labels, gen_ids
        torch.cuda.empty_cache()
        break

Predictions (first batch):

GEN 1: some some.
TGT 1: minimal biapical pleural thickening. slight blunting of the posterior left costophrenic angle. no other significant alterations.
Using device: cuda:0
chexbert_f1_weighted: 0.0
chexbert_f1_micro: 0.0
chexbert_f1_macro: 0.0
chexbert_f1_micro_5: 0.0
chexbert_f1_macro_5: 0.0
bertscore_f1: [0.2914072871208191]
radgraph_f1_RG_E: 0.0
radgraph_f1_RG_ER: 0.0
rouge_l: [0.0]

GEN 2: some some.
TGT 2: minimal biapical pleural thickening. slight blunting of the posterior left costophrenic angle. no other significant alterations.
Using device: cuda:0
chexbert_f1_weighted: 0.0
chexbert_f1_micro: 0.0
chexbert_f1_macro: 0.0
chexbert_f1_micro_5: 0.0
chexbert_f1_macro_5: 0.0
bertscore_f1: [0.2914072871208191]
radgraph_f1_RG_E: 0.0
radgraph_f1_RG_ER: 0.0
rouge_l: [0.0]

GEN 3: some some.
TGT 3: slight residual atelectasis in the right pulmonary base. minimal blunting of the costophrenic angle. chronic changes related to age. no infiltrates or pathologic