In [1]:
import pandas as pd
from itertools import islice
import torch
from torch.utils.data import DataLoader
import sys
sys.path.append(r"C:\Users\emman\Desktop\PROYECTOS_VS_CODE\PRUEBAS_DE_PYTHON\Chest-X-ray-Diagnosis-Automated-Reporting-using-CNNs-and-LLMs---UDEM-PEF-Thesis-Fall-2025")

from utils.text_metrics import evaluate_all_metrics
from utils.temp_utils import *
from utils.gpt_models import DinoGPTCaptioner, DinoGPT2Captioner
from utils.chexpert_dataset import CheXpertDataset
from utils.padchest_dataset import PadChestGRDataset

# Data

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

df = pd.read_csv(r"C:\Users\emman\Desktop\PROYECTOS_VS_CODE\PRUEBAS_DE_PYTHON\PadChest-GR\master_table.csv")   # must contain column 'ImageID'
root_dir = r'C:\Users\emman\Desktop\PROYECTOS_VS_CODE\PRUEBAS_DE_PYTHON\PadChest-GR\PadChest_GR'
json_file = r"C:\Users\emman\Desktop\PROYECTOS_VS_CODE\PRUEBAS_DE_PYTHON\PadChest-GR\grounded_reports_20240819.json"

df_train = df[df['split'] == 'train']
df_validation = df[df['split'] == 'validation']
df_test = df[df['split'] == 'test']


IMG_SIZE = 224
MAX_LEN = 64
NUM_BATCH = 8

tf = dino_image_transform(img_size=IMG_SIZE)

ds_train = PadChestGRDataset(
        dataframe=df_train,
        root_dir=root_dir,
        json_file=json_file,
        max_txt_len=MAX_LEN,
        image_size=IMG_SIZE,
        normalize=True,
        transform=None,
        return_paths=False,
        sentence_key="sentence_en",
    )

ds_valid = PadChestGRDataset(
        dataframe=df_validation,
        root_dir=root_dir,
        json_file=json_file,
        max_txt_len=MAX_LEN,
        image_size=IMG_SIZE,
        normalize=True,
        transform=None,
        return_paths=False,
        sentence_key="sentence_en",
    )

ds_test = PadChestGRDataset(
        dataframe=df_test,
        root_dir=root_dir,
        json_file=json_file,
        max_txt_len=MAX_LEN,
        image_size=IMG_SIZE,
        normalize=True,
        transform=None,
        return_paths=False,
        sentence_key="sentence_en",
    )

tokenizer = build_tokenizer_from_labels()
pad_id = tokenizer.pad_token_id
eos_id = tokenizer.eos_token_id
bos_id = tokenizer.bos_token_id
collate_fn = CaptionCollate(tokenizer, pad_id)

train_loader = DataLoader(ds_train, batch_size=NUM_BATCH, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(ds_valid, batch_size=NUM_BATCH, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(ds_test, batch_size=NUM_BATCH, shuffle=False, collate_fn=collate_fn)

Using device: cuda


# Model

In [3]:
# DINO ViT-S/16 hidden size is 384 
EMBEDDING_D_IMG = 384
N_PREFIX = (IMG_SIZE // 16) ** 2  # number of visual prefix tokens (including CLS)

model = DinoGPTCaptioner(
    vocab_size=tokenizer.vocab_size,
    d_img=EMBEDDING_D_IMG,
    pad_id=pad_id,
    d_model=512,
    n_layer=8,
    n_head=8,
    n_prefix=N_PREFIX,           # number of visual prefix tokens
    max_seq_len=256,
    dino_model_id="facebook/dinov3-vits16-pretrain-lvd1689m",
    freeze_dino=True,
).to(device)

# Train Parameters

In [4]:
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()), lr=3e-4, weight_decay=1e-2
)
loss = sequence_ce_loss
NUM_EPOCHS = 10
BATCHES_PER_EPOCH = 10

# Training

In [5]:
for epoch in range(NUM_EPOCHS):
    slice_train_loader = islice(train_loader, BATCHES_PER_EPOCH)
    slice_valid_loader = islice(valid_loader, BATCHES_PER_EPOCH)
    train_stats = train_one_epoch(model, slice_train_loader, optimizer, device, pad_id, num_batches=BATCHES_PER_EPOCH, loss_fn=loss, grad_clip=1.0)
    val_stats = evaluate(model, slice_valid_loader, device, pad_id, num_batches=BATCHES_PER_EPOCH, loss_fn=loss)
    print(f"Epoch {epoch + 1}: Train Loss={train_stats['loss']:.4f}, PPL={train_stats['ppl']:.2f} | "
            f"Val Loss={val_stats['val_loss']:.4f}, Val PPL={val_stats['val_ppl']:.2f}")

Training: 100%|██████████| 10/10 [00:07<00:00,  1.28it/s]
Evaluating: 100%|██████████| 10/10 [00:07<00:00,  1.39it/s]


Epoch 1: Train Loss=9.6252, PPL=18619.41 | Val Loss=8.6538, Val PPL=5755.87


Training: 100%|██████████| 10/10 [00:08<00:00,  1.20it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.44it/s]


Epoch 2: Train Loss=7.9779, PPL=3376.97 | Val Loss=7.2352, Val PPL=1432.64


Training: 100%|██████████| 10/10 [00:07<00:00,  1.27it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.54it/s]


Epoch 3: Train Loss=6.7528, PPL=895.12 | Val Loss=6.7752, Val PPL=909.21


Training: 100%|██████████| 10/10 [00:07<00:00,  1.28it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.64it/s]


Epoch 4: Train Loss=6.3076, PPL=569.79 | Val Loss=6.4130, Val PPL=616.37


Training: 100%|██████████| 10/10 [00:07<00:00,  1.31it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


Epoch 5: Train Loss=6.1533, PPL=477.88 | Val Loss=5.9963, Val PPL=419.15


Training: 100%|██████████| 10/10 [00:07<00:00,  1.42it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.50it/s]


Epoch 6: Train Loss=6.0381, PPL=430.03 | Val Loss=5.9613, Val PPL=409.76


Training: 100%|██████████| 10/10 [00:07<00:00,  1.25it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.51it/s]


Epoch 7: Train Loss=5.9183, PPL=402.60 | Val Loss=5.7845, Val PPL=330.52


Training: 100%|██████████| 10/10 [00:07<00:00,  1.29it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.64it/s]


Epoch 8: Train Loss=5.7416, PPL=324.47 | Val Loss=5.5605, Val PPL=269.05


Training: 100%|██████████| 10/10 [00:07<00:00,  1.29it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.43it/s]


Epoch 9: Train Loss=5.4962, PPL=249.71 | Val Loss=5.4478, Val PPL=239.39


Training: 100%|██████████| 10/10 [00:07<00:00,  1.41it/s]
Evaluating: 100%|██████████| 10/10 [00:06<00:00,  1.64it/s]

Epoch 10: Train Loss=5.4669, PPL=247.51 | Val Loss=5.2507, Val PPL=197.69





# Test Parameters

In [6]:
BATCHES_PER_TEST = 1
GREEDY_DECODE = True
TEST_MAX_LEN = 256
TEST_TOP_P = 0.9
TEST_TEMPERATURE = 0.9

# Test

In [7]:
slice_test_loader = islice(test_loader, BATCHES_PER_TEST)
test_stats = evaluate(model, slice_test_loader, device, pad_id, num_batches=BATCHES_PER_TEST)
print(f"Test Loss={test_stats['val_loss']:.4f}, Test PPL={test_stats['val_ppl']:.2f}")

Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s]

Test Loss=4.7285, Test PPL=113.13





# Test Report Generation

In [8]:
with torch.no_grad():
    for pixel_values, ids_loader, paths, raw_labels in test_loader:
        pixel_values = pixel_values.to(device)
        gen_ids = model.generate(
            pixel_values=pixel_values,
            bos_id=bos_id, eos_id=eos_id,
            max_new_tokens=TEST_MAX_LEN, top_p=TEST_TOP_P, temperature=TEST_TEMPERATURE, greedy=GREEDY_DECODE
        )
        print("Predictions (first batch):")
        for i in range(gen_ids.size(0)):
            text_gen = tokenizer.decode(gen_ids[i].tolist())
            text_tgt = tokenizer.decode(ids_loader[i].tolist())
            print(f"\nGEN {i+1}:", text_gen)
            print(f"TGT {i+1}:", text_tgt)
            results = evaluate_all_metrics([text_tgt], [text_gen], evaluation_mode="CheXagent")
            for metric, scores in results.items():
                print(f"{metric}: {scores}")
        del pixel_values, ids_loader, paths, raw_labels, gen_ids
        torch.cuda.empty_cache()
        break

Predictions (first batch):

GEN 1: no significant right pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleuralome right pleural pleuralome right pleural pleural pleural pleuralome right pleuralome right pleural pleural of the right pleural pleuralome right of the right of the righty.
TGT 1: minimal biapical pleural thickening. slight blunting of the posterior left costophrenic angle. no other significant alterations.
Using device: cuda:0
chexbert_f1_weighted: 0.0
chexbert_f1_micro: 0.0
chexbert_f1_macro: 0.0
chexbert_f1_micro_5: 0.0
chexbert_f1_macro_5: 0.0
bertscore_f1: [0.5536412000656128]
radgraph_f1_RG_E: 0.0
radgraph_f1_RG_ER: 0.0
rouge_l: [0.08571428571428572]

GEN 2: no significant right pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural pleural