In [None]:
import os
os.chdir(os.path.dirname(os.getcwd()))
import pandas as pd
from itertools import islice
import torch
from torch.utils.data import DataLoader
from utils.text_metrics import evaluate_all_metrics
from utils.temp_utils import *
from utils.gpt_models import DinoGPTCaptioner, DinoGPT2Captioner
from utils.chexpert_dataset import CheXpertDataset
from utils.padchest_dataset import PadChestGRDataset

# Data

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

CSV_PATH = "Datasets/CheXpertPlus/df_chexpert_plus_240401.csv"
IMG_ROOT = "Datasets/CheXpertPlus/PNG"

CSV_PATH = os.path.join(os.getcwd(), CSV_PATH)
IMG_ROOT = os.path.join(os.getcwd(), IMG_ROOT)

TEXT_COL = "section_impression"
PATH_COL = "path_to_image"

IMG_SIZE = 516
MAX_LEN = 64
NUM_BATCH = 8

tf = dino_image_transform(img_size=IMG_SIZE)

ds_train = CheXpertDataset(img_root=IMG_ROOT, csv_path=CSV_PATH, split="train", transform=tf, text_col=TEXT_COL)
ds_valid = CheXpertDataset(img_root=IMG_ROOT, csv_path=CSV_PATH, split="valid", transform=tf, text_col=TEXT_COL)
ds_test = CheXpertDataset(img_root=IMG_ROOT, csv_path=CSV_PATH, split="test", transform=tf, text_col=TEXT_COL)

#labels = pd.read_csv(CSV_PATH)[TEXT_COL].tolist()

tokenizer = build_tokenizer_from_labels(captions=None)
pad_id = tokenizer.pad_token_id
eos_id = tokenizer.eos_token_id
bos_id = tokenizer.bos_token_id
collate_fn = CaptionCollate(tokenizer, pad_id)

train_loader = DataLoader(ds_train, batch_size=NUM_BATCH, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(ds_valid, batch_size=NUM_BATCH, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(ds_test, batch_size=NUM_BATCH, shuffle=False, collate_fn=collate_fn)

Using device: cuda
[INFO] Kept 47494/223462 rows with existing PNGs under C:\Users\emman\Desktop\PROYECTOS_VS_CODE\PRUEBAS_DE_PYTHON\CheXpertPlus\PNG
[INFO] Kept 47494/223462 rows with existing PNGs under C:\Users\emman\Desktop\PROYECTOS_VS_CODE\PRUEBAS_DE_PYTHON\CheXpertPlus\PNG
[INFO] Kept 47494/223462 rows with existing PNGs under C:\Users\emman\Desktop\PROYECTOS_VS_CODE\PRUEBAS_DE_PYTHON\CheXpertPlus\PNG


In [3]:
tokenizer_size = tokenizer.vocab_size
print("Tokenizer size:", tokenizer_size)

Tokenizer size: 58996


# Model

In [4]:
# DINO ViT-S/16 hidden size is 384 
EMBEDDING_D_IMG = 384
N_PREFIX = (IMG_SIZE // 16) ** 2  # number of visual prefix tokens (including CLS)

def pick_heads(d_model, target_head_dim=64):
    h = max(1, round(d_model / target_head_dim))
    while d_model % h != 0: h -= 1
    return h

D_MODEL = 768
N_HEAD = pick_heads(D_MODEL, 64)  # -> 12


model = DinoGPTCaptioner(
    vocab_size=tokenizer.vocab_size,
    d_img=EMBEDDING_D_IMG,
    pad_id=pad_id,
    d_model=D_MODEL,
    n_layer=12,
    n_head=N_HEAD,
    n_prefix=N_PREFIX,           # number of visual prefix tokens
    max_seq_len=512,
    dino_model_id="facebook/dinov3-vits16-pretrain-lvd1689m",
    freeze_dino=False,
).to(device)

# Print model parameters and trainable parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Total model parameters: {total_params / 1_000_000:.2f} Millions")

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable model parameters: {trainable_params / 1_000_000:.2f} Millions")

# Print model footprint
model_footprint_in_gb = (total_params * 4) * (1e-9)  # assuming 4 bytes per parameter (float32)
print(f"Approximate model footprint: {model_footprint_in_gb:.2f} GB")

# after model init
model.decoder.lm_head.weight = model.decoder.tok_emb.weight  # weight tying

Total model parameters: 198.72 Millions
Trainable model parameters: 198.72 Millions
Approximate model footprint: 0.79 GB


# Train Parameters

In [5]:
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()), lr=3e-4, weight_decay=1e-2
)
loss = sequence_ce_loss
NUM_EPOCHS = 50
BATCHES_PER_EPOCH = 10

# Training

In [20]:
for epoch in range(NUM_EPOCHS):
    slice_train_loader = islice(train_loader, BATCHES_PER_EPOCH)
    slice_valid_loader = islice(valid_loader, BATCHES_PER_EPOCH)
    train_stats = train_one_epoch(model, slice_train_loader, optimizer, device, pad_id, num_batches=BATCHES_PER_EPOCH, loss_fn=loss, grad_clip=1.0)
    val_stats = evaluate(model, slice_valid_loader, device, pad_id, num_batches=BATCHES_PER_EPOCH, loss_fn=loss)
    print(f"Epoch {epoch + 1}: Train Loss={train_stats['loss']:.4f}, PPL={train_stats['ppl']:.2f} | "
            f"Val Loss={val_stats['val_loss']:.4f}, Val PPL={val_stats['val_ppl']:.2f}")

Training: 100%|██████████| 10/10 [00:48<00:00,  4.84s/it]
Evaluating: 100%|██████████| 10/10 [00:20<00:00,  2.02s/it]


Epoch 1: Train Loss=4.7610, PPL=120.07 | Val Loss=4.7468, Val PPL=119.05


Training: 100%|██████████| 10/10 [00:44<00:00,  4.47s/it]
Evaluating: 100%|██████████| 10/10 [00:19<00:00,  1.95s/it]


Epoch 2: Train Loss=4.7215, PPL=114.84 | Val Loss=4.7281, Val PPL=117.16


Training: 100%|██████████| 10/10 [00:45<00:00,  4.50s/it]
Evaluating: 100%|██████████| 10/10 [00:23<00:00,  2.33s/it]


Epoch 3: Train Loss=4.8598, PPL=134.45 | Val Loss=4.7301, Val PPL=117.14


Training: 100%|██████████| 10/10 [00:45<00:00,  4.51s/it]
Evaluating: 100%|██████████| 10/10 [01:56<00:00, 11.68s/it]


Epoch 4: Train Loss=5.0043, PPL=153.38 | Val Loss=4.7111, Val PPL=114.74


Training: 100%|██████████| 10/10 [00:44<00:00,  4.48s/it]
Evaluating: 100%|██████████| 10/10 [00:20<00:00,  2.09s/it]


Epoch 5: Train Loss=4.8435, PPL=134.00 | Val Loss=4.7121, Val PPL=114.61


Training: 100%|██████████| 10/10 [00:41<00:00,  4.18s/it]
Evaluating: 100%|██████████| 10/10 [00:13<00:00,  1.34s/it]


Epoch 6: Train Loss=4.8298, PPL=129.45 | Val Loss=4.7035, Val PPL=113.88


Training: 100%|██████████| 10/10 [00:44<00:00,  4.44s/it]
Evaluating: 100%|██████████| 10/10 [00:17<00:00,  1.72s/it]


Epoch 7: Train Loss=4.8992, PPL=138.36 | Val Loss=4.6975, Val PPL=113.51


Training: 100%|██████████| 10/10 [00:38<00:00,  3.89s/it]
Evaluating: 100%|██████████| 10/10 [00:13<00:00,  1.36s/it]


Epoch 8: Train Loss=4.7911, PPL=122.39 | Val Loss=4.6901, Val PPL=112.54


Training: 100%|██████████| 10/10 [00:44<00:00,  4.44s/it]
Evaluating: 100%|██████████| 10/10 [00:15<00:00,  1.56s/it]


Epoch 9: Train Loss=4.7365, PPL=117.52 | Val Loss=4.6698, Val PPL=110.29


Training: 100%|██████████| 10/10 [00:39<00:00,  3.96s/it]
Evaluating: 100%|██████████| 10/10 [00:14<00:00,  1.41s/it]


Epoch 10: Train Loss=4.8974, PPL=138.35 | Val Loss=4.6649, Val PPL=109.80


Training: 100%|██████████| 10/10 [00:44<00:00,  4.47s/it]
Evaluating: 100%|██████████| 10/10 [00:52<00:00,  5.25s/it]


Epoch 11: Train Loss=4.7048, PPL=112.76 | Val Loss=4.6704, Val PPL=110.63


Training: 100%|██████████| 10/10 [00:38<00:00,  3.90s/it]
Evaluating: 100%|██████████| 10/10 [00:11<00:00,  1.13s/it]


Epoch 12: Train Loss=4.7965, PPL=124.96 | Val Loss=4.6728, Val PPL=110.94


Training: 100%|██████████| 10/10 [00:38<00:00,  3.88s/it]
Evaluating: 100%|██████████| 10/10 [00:16<00:00,  1.60s/it]


Epoch 13: Train Loss=4.6231, PPL=105.02 | Val Loss=4.6567, Val PPL=109.12


Training: 100%|██████████| 10/10 [00:45<00:00,  4.52s/it]
Evaluating: 100%|██████████| 10/10 [00:17<00:00,  1.72s/it]


Epoch 14: Train Loss=4.7304, PPL=117.45 | Val Loss=4.6698, Val PPL=110.40


Training: 100%|██████████| 10/10 [00:42<00:00,  4.27s/it]
Evaluating: 100%|██████████| 10/10 [00:13<00:00,  1.35s/it]


Epoch 15: Train Loss=4.8487, PPL=129.94 | Val Loss=4.6670, Val PPL=109.54


Training: 100%|██████████| 10/10 [00:38<00:00,  3.88s/it]
Evaluating: 100%|██████████| 10/10 [00:18<00:00,  1.83s/it]


Epoch 16: Train Loss=4.6708, PPL=113.08 | Val Loss=4.6506, Val PPL=108.30


Training: 100%|██████████| 10/10 [00:41<00:00,  4.14s/it]
Evaluating: 100%|██████████| 10/10 [00:16<00:00,  1.64s/it]


Epoch 17: Train Loss=4.6635, PPL=109.31 | Val Loss=4.6418, Val PPL=107.24


Training: 100%|██████████| 10/10 [00:36<00:00,  3.69s/it]
Evaluating: 100%|██████████| 10/10 [00:07<00:00,  1.30it/s]


Epoch 18: Train Loss=4.6933, PPL=111.92 | Val Loss=4.6498, Val PPL=108.21


Training: 100%|██████████| 10/10 [00:40<00:00,  4.07s/it]
Evaluating: 100%|██████████| 10/10 [01:43<00:00, 10.37s/it]


Epoch 19: Train Loss=4.6684, PPL=109.44 | Val Loss=4.6499, Val PPL=108.41


Training: 100%|██████████| 10/10 [00:47<00:00,  4.74s/it]
Evaluating: 100%|██████████| 10/10 [00:13<00:00,  1.39s/it]


Epoch 20: Train Loss=4.5450, PPL=95.52 | Val Loss=4.6350, Val PPL=107.10


Training: 100%|██████████| 10/10 [00:41<00:00,  4.19s/it]
Evaluating: 100%|██████████| 10/10 [00:34<00:00,  3.49s/it]


Epoch 21: Train Loss=4.7978, PPL=123.71 | Val Loss=4.6374, Val PPL=107.50


Training: 100%|██████████| 10/10 [00:46<00:00,  4.68s/it]
Evaluating: 100%|██████████| 10/10 [00:15<00:00,  1.52s/it]


Epoch 22: Train Loss=4.7054, PPL=112.02 | Val Loss=4.6244, Val PPL=105.58


Training: 100%|██████████| 10/10 [00:38<00:00,  3.88s/it]
Evaluating: 100%|██████████| 10/10 [00:09<00:00,  1.03it/s]


Epoch 23: Train Loss=4.6007, PPL=102.62 | Val Loss=4.6254, Val PPL=105.58


Training: 100%|██████████| 10/10 [00:48<00:00,  4.88s/it]
Evaluating: 100%|██████████| 10/10 [01:03<00:00,  6.38s/it]


Epoch 24: Train Loss=4.7604, PPL=119.37 | Val Loss=4.6189, Val PPL=105.48


Training: 100%|██████████| 10/10 [00:41<00:00,  4.12s/it]
Evaluating: 100%|██████████| 10/10 [00:20<00:00,  2.05s/it]


Epoch 25: Train Loss=4.6988, PPL=113.10 | Val Loss=4.6092, Val PPL=104.11


Training: 100%|██████████| 10/10 [00:43<00:00,  4.38s/it]
Evaluating: 100%|██████████| 10/10 [02:18<00:00, 13.88s/it]


Epoch 26: Train Loss=4.6553, PPL=108.65 | Val Loss=4.6184, Val PPL=105.53


Training: 100%|██████████| 10/10 [00:48<00:00,  4.87s/it]
Evaluating: 100%|██████████| 10/10 [00:18<00:00,  1.83s/it]


Epoch 27: Train Loss=4.6380, PPL=106.05 | Val Loss=4.5849, Val PPL=102.11


Training: 100%|██████████| 10/10 [00:43<00:00,  4.32s/it]
Evaluating: 100%|██████████| 10/10 [00:16<00:00,  1.61s/it]


Epoch 28: Train Loss=4.6827, PPL=112.49 | Val Loss=4.5837, Val PPL=101.80


Training: 100%|██████████| 10/10 [00:47<00:00,  4.76s/it]
Evaluating: 100%|██████████| 10/10 [00:31<00:00,  3.20s/it]


Epoch 29: Train Loss=4.6517, PPL=110.58 | Val Loss=4.5838, Val PPL=101.63


Training: 100%|██████████| 10/10 [00:51<00:00,  5.11s/it]
Evaluating: 100%|██████████| 10/10 [00:15<00:00,  1.54s/it]


Epoch 30: Train Loss=4.7553, PPL=119.81 | Val Loss=4.5757, Val PPL=100.99


Training: 100%|██████████| 10/10 [00:50<00:00,  5.03s/it]
Evaluating: 100%|██████████| 10/10 [00:13<00:00,  1.39s/it]


Epoch 31: Train Loss=4.6869, PPL=113.11 | Val Loss=4.5787, Val PPL=100.96


Training: 100%|██████████| 10/10 [00:46<00:00,  4.66s/it]
Evaluating: 100%|██████████| 10/10 [00:07<00:00,  1.33it/s]


Epoch 32: Train Loss=4.5550, PPL=97.32 | Val Loss=4.5692, Val PPL=100.08


Training: 100%|██████████| 10/10 [00:49<00:00,  4.96s/it]
Evaluating: 100%|██████████| 10/10 [00:15<00:00,  1.56s/it]


Epoch 33: Train Loss=4.5841, PPL=98.62 | Val Loss=4.5581, Val PPL=98.66


Training: 100%|██████████| 10/10 [00:41<00:00,  4.18s/it]
Evaluating: 100%|██████████| 10/10 [00:16<00:00,  1.70s/it]


Epoch 34: Train Loss=4.6271, PPL=103.76 | Val Loss=4.5624, Val PPL=99.10


Training: 100%|██████████| 10/10 [00:48<00:00,  4.84s/it]
Evaluating: 100%|██████████| 10/10 [00:28<00:00,  2.90s/it]


Epoch 35: Train Loss=4.6963, PPL=112.19 | Val Loss=4.5644, Val PPL=99.24


Training: 100%|██████████| 10/10 [00:41<00:00,  4.17s/it]
Evaluating: 100%|██████████| 10/10 [00:20<00:00,  2.06s/it]


Epoch 36: Train Loss=4.6197, PPL=103.06 | Val Loss=4.5605, Val PPL=98.88


Training: 100%|██████████| 10/10 [00:47<00:00,  4.72s/it]
Evaluating: 100%|██████████| 10/10 [00:15<00:00,  1.56s/it]


Epoch 37: Train Loss=4.6928, PPL=112.32 | Val Loss=4.5585, Val PPL=98.98


Training: 100%|██████████| 10/10 [00:44<00:00,  4.45s/it]
Evaluating: 100%|██████████| 10/10 [00:12<00:00,  1.25s/it]


Epoch 38: Train Loss=4.5633, PPL=96.20 | Val Loss=4.5560, Val PPL=98.77


Training: 100%|██████████| 10/10 [00:49<00:00,  4.99s/it]
Evaluating: 100%|██████████| 10/10 [00:56<00:00,  5.69s/it]


Epoch 39: Train Loss=4.6501, PPL=107.99 | Val Loss=4.5495, Val PPL=98.37


Training: 100%|██████████| 10/10 [00:45<00:00,  4.50s/it]
Evaluating: 100%|██████████| 10/10 [00:17<00:00,  1.71s/it]


Epoch 40: Train Loss=4.6091, PPL=103.76 | Val Loss=4.5501, Val PPL=98.44


Training: 100%|██████████| 10/10 [00:50<00:00,  5.05s/it]
Evaluating: 100%|██████████| 10/10 [00:21<00:00,  2.19s/it]


Epoch 41: Train Loss=4.5894, PPL=101.21 | Val Loss=4.5462, Val PPL=98.14


Training: 100%|██████████| 10/10 [00:42<00:00,  4.26s/it]
Evaluating: 100%|██████████| 10/10 [00:18<00:00,  1.87s/it]


Epoch 42: Train Loss=4.6601, PPL=107.12 | Val Loss=4.5446, Val PPL=97.53


Training: 100%|██████████| 10/10 [00:40<00:00,  4.06s/it]
Evaluating: 100%|██████████| 10/10 [00:07<00:00,  1.29it/s]


Epoch 43: Train Loss=4.6248, PPL=103.08 | Val Loss=4.5510, Val PPL=98.56


Training: 100%|██████████| 10/10 [00:49<00:00,  4.99s/it]
Evaluating: 100%|██████████| 10/10 [00:16<00:00,  1.67s/it]


Epoch 44: Train Loss=4.6111, PPL=104.71 | Val Loss=4.5477, Val PPL=97.86


Training: 100%|██████████| 10/10 [00:47<00:00,  4.77s/it]
Evaluating: 100%|██████████| 10/10 [00:18<00:00,  1.88s/it]


Epoch 45: Train Loss=4.7683, PPL=122.06 | Val Loss=4.5317, Val PPL=96.47


Training: 100%|██████████| 10/10 [00:45<00:00,  4.57s/it]
Evaluating: 100%|██████████| 10/10 [00:19<00:00,  1.95s/it]


Epoch 46: Train Loss=4.6857, PPL=111.37 | Val Loss=4.5358, Val PPL=96.91


Training: 100%|██████████| 10/10 [00:42<00:00,  4.26s/it]
Evaluating: 100%|██████████| 10/10 [00:14<00:00,  1.43s/it]


Epoch 47: Train Loss=4.6399, PPL=106.16 | Val Loss=4.5364, Val PPL=96.77


Training: 100%|██████████| 10/10 [00:55<00:00,  5.50s/it]
Evaluating: 100%|██████████| 10/10 [00:16<00:00,  1.64s/it]


Epoch 48: Train Loss=4.8458, PPL=130.54 | Val Loss=4.5187, Val PPL=95.26


Training: 100%|██████████| 10/10 [00:46<00:00,  4.62s/it]
Evaluating: 100%|██████████| 10/10 [00:51<00:00,  5.17s/it]


Epoch 49: Train Loss=4.6191, PPL=103.34 | Val Loss=4.5166, Val PPL=94.87


Training: 100%|██████████| 10/10 [00:48<00:00,  4.89s/it]
Evaluating: 100%|██████████| 10/10 [00:15<00:00,  1.53s/it]

Epoch 50: Train Loss=4.5957, PPL=101.86 | Val Loss=4.5300, Val PPL=96.36





# Test Parameters

In [21]:
BATCHES_PER_TEST = 1
GREEDY_DECODE = True
TEST_MAX_LEN = 256
TEST_TOP_P = 0.9
TEST_TEMPERATURE = 0.9

# Test

In [22]:
slice_test_loader = islice(test_loader, BATCHES_PER_TEST)
test_stats = evaluate(model, slice_test_loader, device, pad_id, num_batches=BATCHES_PER_TEST)
print(f"Test Loss={test_stats['val_loss']:.4f}, Test PPL={test_stats['val_ppl']:.2f}")

Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.21it/s]

Test Loss=4.3878, Test PPL=80.46





# Test Report Generation

In [23]:
# capitalize first word and first word after each "."
def capitalize_sentences(s):
    parts = s.split('. ')
    parts = [p[:1].upper() + p[1:] if p else '' for p in parts]
    return '. '.join(parts)

with torch.no_grad():
    for pixel_values, ids_loader, paths, raw_labels in test_loader:
        pixel_values = pixel_values.to(device)
        gen_ids = model.generate(
            pixel_values=pixel_values,
            bos_id=bos_id,
            eos_id=eos_id,
            max_new_tokens=TEST_MAX_LEN,
            beam_size=3,                # Set your desired beam size
            temperature=TEST_TEMPERATURE
        )

        info = model.generate_with_logging(
            pixel_values=pixel_values,          # [B, C, H, W]
            bos_id=tokenizer.bos_token_id,
            eos_id=tokenizer.eos_token_id,
            tokenizer=tokenizer,
            preset="safe_sample",
            stop_sequences=["\n\n", "Impression:"],
            max_new_tokens=256,
        )

        print("batch sequences shape:", info["sequences"].shape)
        for i, s in enumerate(info["per_sample"]):
            print(f"[sample {i}] hit_eos={s['stopping']['hit_eos']} repetition={s['repetition']}")
            if "generated" in s["text"]:
                print(capitalize_sentences(s["text"]["generated"]))
                print("[Target text]", capitalize_sentences(raw_labels[i]))

        eval_results = evaluate_all_metrics(raw_labels, [s["text"]["generated"] for s in info["per_sample"]], evaluation_mode="CheXagent")
        for metric, scores in eval_results.items():
            print(f"{metric}: {scores}")


        print("Predictions (first batch):")
        for i in range(gen_ids.size(0)):
            text_gen = tokenizer.decode(gen_ids[i].tolist())
            text_tgt = tokenizer.decode(ids_loader[i].tolist())
            print(f"\nGEN {i+1}:", capitalize_sentences(text_gen))
            print(f"TGT {i+1}:", capitalize_sentences(text_tgt))
            results = evaluate_all_metrics([text_tgt], [text_gen], evaluation_mode="CheXagent")
            for metric, scores in results.items():
                print(f"{metric}: {scores}")
        del pixel_values, ids_loader, paths, raw_labels, gen_ids
        torch.cuda.empty_cache()
        break

batch sequences shape: torch.Size([8, 85])
[sample 0] hit_eos=True repetition={'max_token_run': 1, 'max_repeat_trigram': 1, 'max_repeat_4gram': 1}
Interval placement of a right internal jugular catheter with tip at the cavoatrial junction. No evidence of pneumothorax is seen in place. Persistent bibasilar opacities left greater than left base atelectasis or consolidation versus aspiration.
[Target text] Interval placement of a right internal jugular venous sheath with the distal tip in the proximal superior vena cava. No pneumothorax. Stable position of nasogastric tube feeding tube tracheostomy canula left internal jugular central venous catheter and left upper extremity picc. No significant interval change in hyperexpanded lung volumes right basilar opacities small bilateral pleural effusions tenting of the right hemidiaphragm and biapical pleural thickening.
[sample 1] hit_eos=True repetition={'max_token_run': 1, 'max_repeat_trigram': 1, 'max_repeat_4gram': 1}
Single frontal radiogr

In [24]:
import re
import string

def clean_text(text: str) -> str:
    # lowercase
    text = text.lower()

    # remove enumerators like "1." or "23." but KEEP decimals like "2.5"
    text = re.sub(r'(?<!\d)\b\d+\.(?!\d)', ' ', text)

    # remove all punctuation EXCEPT "."
    punctuation = string.punctuation.replace('.', '')
    text = text.translate(str.maketrans('', '', punctuation))

    # normalize spaces around periods to ". " → ". "
    text = re.sub(r'\s*\.\s*', '. ', text)

    # collapse multiple spaces and trim
    text = re.sub(r'\s+', ' ', text).strip()

    # capitalize first word and first word after each "."
    def capitalize_sentences(s):
        parts = s.split('. ')
        parts = [p[:1].upper() + p[1:] if p else '' for p in parts]
        return '. '.join(parts)
    text = capitalize_sentences(text)

    return text

# Example
text = "1.  STABLE SMALL LEFT INTERNAL JUGULAR OPACITIES... 2.5 cm nodule; item 2. next. 3. Done."
print(clean_text(text))


# Example
text = """
 1.  INTERVAL PLACEMENT OF A RIGHT INTERNAL JUGULAR VENOUS SHEATH 
WITH THE DISTAL TIP IN THE PROXIMAL SUPERIOR VENA CAVA.  NO 
PNEUMOTHORAX.
 
 2.  STABLE POSITION OF NASOGASTRIC TUBE, FEEDING TUBE, TRACHEOSTOMY 
CANULA, LEFT INTERNAL JUGULAR CENTRAL VENOUS CATHETER, AND LEFT UPPER 
EXTREMITY PICC.  
 
 3.  NO SIGNIFICANT INTERVAL CHANGE IN HYPEREXPANDED LUNG VOLUMES, 
RIGHT BASILAR OPACITIES, SMALL BILATERAL PLEURAL EFFUSIONS, TENTING 
OF THE RIGHT HEMIDIAPHRAGM AND BIAPICAL PLEURAL THICKENING. 
 
 """
cleaned_text = clean_text(text)
print(cleaned_text)


Stable small left internal jugular opacities. . . 2. 5 cm nodule item next. Done.
Interval placement of a right internal jugular venous sheath with the distal tip in the proximal superior vena cava. No pneumothorax. Stable position of nasogastric tube feeding tube tracheostomy canula left internal jugular central venous catheter and left upper extremity picc. No significant interval change in hyperexpanded lung volumes right basilar opacities small bilateral pleural effusions tenting of the right hemidiaphragm and biapical pleural thickening.


In [25]:
text = "1.  STABLE SMALL LEFT INTERNAL JUGULAR OPACITIES WITH PATCHY TUBE AND NASOGASTRIC TUBES, RIGHT LOWER MEDIASTINAL SIDED CATHETER.  NO SIGNIFICANT CHANGE IN THE PREVIOUS STUDYDEMONSTRATE ATELECTASIS O"
print("Original text:", text)
encoded = tokenizer.encode(text)
words = text.split()
print("Number of words:", len(words), "Number of tokens:", len(encoded), "pad_id:", pad_id, "eos_id:", eos_id, "bos_id:", bos_id)
print("BOS token id:", tokenizer.bos_token_id, "EOS token id:", tokenizer.eos_token_id, "PAD token id:", tokenizer.pad_token_id)
print(encoded)
for token_id in encoded:    
    print(f"Token ID: {token_id}, Token: {tokenizer.decode([token_id])}")

print("\nAfter lowercasing:")
textlower = cleaned_text
encoded = tokenizer.encode(textlower)
words = textlower.split()
print("Number of words:", len(words), "Number of tokens:", len(encoded), "pad_id:", pad_id, "eos_id:", eos_id, "bos_id:", bos_id)
print("BOS token id:", tokenizer.bos_token_id, "EOS token id:", tokenizer.eos_token_id, "PAD token id:", tokenizer.pad_token_id)
print(encoded)
for token_id in encoded:    
    print(f"Token ID: {token_id}, Token: {tokenizer.decode([token_id])}")

Original text: 1.  STABLE SMALL LEFT INTERNAL JUGULAR OPACITIES WITH PATCHY TUBE AND NASOGASTRIC TUBES, RIGHT LOWER MEDIASTINAL SIDED CATHETER.  NO SIGNIFICANT CHANGE IN THE PREVIOUS STUDYDEMONSTRATE ATELECTASIS O
Number of words: 27 Number of tokens: 43 pad_id: 0 eos_id: 102 bos_id: 101
BOS token id: 101 EOS token id: 102 PAD token id: 0
[101, 122, 119, 6111, 1353, 1286, 4422, 34986, 5552, 39280, 49176, 1114, 10085, 1183, 7159, 1105, 9468, 7301, 32519, 11182, 117, 1268, 2211, 2394, 34979, 7050, 11641, 5855, 30682, 119, 1185, 2418, 1849, 1107, 1103, 2166, 2025, 31386, 8756, 18465, 14229, 184, 102]
Token ID: 101, Token: 
Token ID: 122, Token: 1
Token ID: 119, Token: .
Token ID: 6111, Token: stable
Token ID: 1353, Token: small
Token ID: 1286, Token: left
Token ID: 4422, Token: internal
Token ID: 34986, Token: jug
Token ID: 5552, Token: ##ular
Token ID: 39280, Token: opa
Token ID: 49176, Token: ##cities
Token ID: 1114, Token: with
Token ID: 10085, Token: patch
Token ID: 1183, Token: ##y
T