In [2]:
!pip install --no-deps bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [3]:
#IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torchvision import models, transforms
from PIL import Image
import torch
import torch.nn as nn
import math
import json
from collections import Counter
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from bert_score import score

In [4]:
# reading training captions
file_path = '/kaggle/input/rocov2/ROCOv2/train_captions.csv'
data = pd.read_csv(file_path)

In [5]:
# reading testing captions
file_path_test = '/kaggle/input/rocov2/ROCOv2/test_captions.csv'
data_test = pd.read_csv(file_path_test)

In [6]:
data.head()

Unnamed: 0,ID,Caption
0,ROCOv2_2023_train_000001,Head CT demonstrating left parotiditis.
1,ROCOv2_2023_train_000002,Acquired renal cysts in end-stage renal failur...
2,ROCOv2_2023_train_000003,Computed tomography of the chest showing the r...
3,ROCOv2_2023_train_000004,Lateral view of the sacrum showing the low con...
4,ROCOv2_2023_train_000005,Thoracic CT scan showing perihilar pulmonary l...


In [7]:
data_test.head()

Unnamed: 0,ID,Caption
0,ROCOv2_2023_test_000001,CT chest axial view showing a huge ascending a...
1,ROCOv2_2023_test_000002,Computed tomography (CT) shows floating thromb...
2,ROCOv2_2023_test_000003,Digitally subtracted angiogram demonstrates ac...
3,ROCOv2_2023_test_000004,Digitally subtracted angiogram of the IMA demo...
4,ROCOv2_2023_test_000005,Angle measurement of a Type 1 canal.


In [8]:
captions = data['Caption'].tolist()
captions = captions[:10000]
print(captions[:5])
print(len(captions))

['Head CT demonstrating left parotiditis.', 'Acquired renal cysts in end-stage renal failure: 16-year-old girl with Alport syndrome and peritoneal dialysis from the age of 2\xa0years', 'Computed tomography of the chest showing the right breast nodule with irregular margins', 'Lateral view of the sacrum showing the low contrast between bone and soft tissue.', 'Thoracic CT scan showing perihilar pulmonary lymphadenomegaly']
10000


In [9]:
captions_test = data_test['Caption'].tolist()
captions_test = captions_test[:2000]
print(captions_test[:5])
print(len(captions_test))

['CT chest axial view showing a huge ascending aortic aneurysm (*).', 'Computed tomography (CT) shows floating thrombosis (white arrow)', 'Digitally subtracted angiogram demonstrates active extravasation of the superior rectal artery into the ileal-conduit (blue arrow)', 'Digitally subtracted angiogram of the IMA demonstrated cessation of flow through the proximal superior rectal artery in the region of the intersection between the artery and ureter with retained perfusion of the rectosigmoid region and resolution of active extravasation', 'Angle measurement of a Type 1 canal.']
2000


In [10]:
base_path = "/kaggle/input/rocov2/ROCOv2/train_images/train/"
image_ids = data['ID']
image_paths = [f"{base_path}{img_id}.jpg" for img_id in image_ids]
images = [Image.open(path) for path in image_paths]
images = images[:24000]

print(images[:5])

[<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=682x748 at 0x78275F131210>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=307x224 at 0x78276016CFD0>, <PIL.JpegImagePlugin.JpegImageFile image mode=L size=358x263 at 0x78275F131A50>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=567x567 at 0x7828707381D0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=600x512 at 0x78275EAC3610>]


In [11]:
base_path_test = "/kaggle/input/rocov2/ROCOv2/test_images/test/"
image_ids_test = data_test['ID']
image_paths_test = [f"{base_path_test}{img_id}.jpg" for img_id in image_ids_test]
images_test = [Image.open(path) for path in image_paths_test]
images_test = images_test[:6000]

print(images_test[:5])

[<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=653x658 at 0x78275048C5D0>, <PIL.JpegImagePlugin.JpegImageFile image mode=L size=598x669 at 0x78275048C510>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=896x977 at 0x78275048C990>, <PIL.JpegImagePlugin.JpegImageFile image mode=L size=896x875 at 0x78275048C450>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=782x516 at 0x78275048D510>]


In [12]:
# Load a pre-trained ResNet model and move it to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

resnet = models.resnet50(pretrained=True).to(device)
resnet = nn.Sequential(*list(resnet.children())[:-1])   # remove the classification layer to fit transformer
resnet.eval().to(device)

# Define a transformation pipeline for the images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to extract features from an image
def extract_features(image):
    # Ensure the image is in RGB format
    image = image.convert("RGB")
    
    # Apply transformations
    image_tensor = transform(image).unsqueeze(0).to(device)  # Add batch dimension and move to GPU
    
    # Extract features using the model
    with torch.no_grad():
        features = resnet(image_tensor)
    
    return features.squeeze().cpu().numpy()  # Move back to CPU for further processing

# Initialize lists to store features and captions
all_features = []
all_captions = []

# Extract features for all images and log to wandb
for idx, (image, caption) in enumerate(zip(images, captions)):
    features = extract_features(image)
    all_features.append(features)
    all_captions.append(caption)
    
# Save features and captions to local files
np.save("/kaggle/working/features_24000.npy", np.array(all_features))  # Save features as a NumPy array
with open("/kaggle/working/captions_24000.json", "w") as f:
    json.dump(all_captions, f)  # Save captions as a JSON file

# Initialize lists to store features and captions
all_features_test = []
all_captions_test = []

# Extract features for all images and log to wandb
for idx, (image, caption) in enumerate(zip(images_test, captions_test)):
    features_test = extract_features(image)
    all_features_test.append(features_test)
    all_captions_test.append(caption)
    
# Save features and captions to local files
np.save("/kaggle/working/features_6000_test.npy", np.array(all_features_test))  # Save features as a NumPy array
with open("/kaggle/working/captions_6000_test.json", "w") as f:
    json.dump(all_captions_test, f)  # Save captions as a JSON file

print("Features and captions saved locally.")

Using device: cuda


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 217MB/s]


Features and captions saved locally.


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

features = np.load("/kaggle/working/features_24000.npy")
features = torch.tensor(features).to(device)  # Convert to tensor and move to GPU
features = features.float()
print(features.shape)

Using device: cuda
torch.Size([10000, 2048])


In [14]:
features_test = np.load("/kaggle/working/features_6000_test.npy")
features_test = torch.tensor(features_test).to(device)  # Convert to tensor and move to GPU
features_test = features_test.float()
print(features_test.shape)

torch.Size([2000, 2048])


In [15]:
with open("/kaggle/working/captions_24000.json", "r") as f:
    captions = json.load(f)

In [16]:
with open("/kaggle/working/captions_6000_test.json", "r") as f:
    captions_test = json.load(f)

In [17]:
# Create vocabulary
vocab = []
all_captions = captions + captions_test
for caption in all_captions:
    words = caption.lower().split()
    vocab.extend(words)
word_count = Counter(vocab)

# Initialize special tokens and their indices
word2idx = {"<pad>":0, "<start>":1, "<end>":2, "<unk>":3}

# Add and assign index to each word from vocabulary
for idx, word in enumerate(word_count.keys(), start=len(word2idx)):
    word2idx[word] = idx

vocab_size = len(word2idx)
print(vocab_size)

22207


In [18]:
# Change each caption into list of word indices
token_ids_list  = []
for caption in captions:
    words = caption.lower().split()

    token_ids = []
    
    token_ids.append(word2idx["<start>"])

    for word in words:
        if word in word2idx:
            token_ids.append(word2idx[word])
        else:
            token_ids.append(word2idx["<unk>"])
            
    token_ids.append(word2idx["<end>"])
    
    token_ids_list.append(token_ids)

print(token_ids_list[:5])

[[1, 4, 5, 6, 7, 8, 2], [1, 9, 10, 11, 12, 13, 10, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 2], [1, 29, 30, 26, 24, 31, 32, 24, 33, 34, 35, 17, 36, 37, 2], [1, 38, 39, 26, 24, 40, 32, 24, 41, 42, 43, 44, 20, 45, 46, 2], [1, 47, 5, 48, 32, 49, 50, 51, 2]]


In [19]:
# Add padding
max_seq_len = max(len(seq) for seq in token_ids_list)
padded_token_ids_list = []
for seq in token_ids_list:
    padded_seq = seq + [word2idx["<pad>"]] * (max_seq_len - len(seq))
    padded_token_ids_list.append(padded_seq)

# Convert padded sequenced to tensor
token_ids = torch.tensor(padded_token_ids_list).to(device)

In [20]:
# Function giving information about the order of words
def positional_encoding(x, max_length = 5000):
    batch_size, seq_len, embed_dim = x.size()
    max_length = max(max_length, seq_len)
    
    pe_matrix = torch.zeros(max_length, embed_dim, device = x.device) # device = x.device avoids mixing tensors across CPU and GPU (Expected all tensors to be on the same device)

    # Generate position indices
    position = torch.arange(0, max_length, device = x.device).unsqueeze(1).float()
    
    # Dividing terms for sine (even idx) and cosine (odd idx)
    dividing_terms = torch.exp(torch.arange(0, embed_dim, 2, device = x.device).float() * (-math.log(10000) / embed_dim))
    pe_matrix[:, 0::2] = torch.sin(position * dividing_terms)
    pe_matrix[:, 1::2] = torch.cos(position * dividing_terms)

    # Add positional encoding to the input tensor
    x = x + pe_matrix[:seq_len, :].unsqueeze(0)
    return x

In [21]:
# Parameters
num_epochs = 10
learning_rate = 0.0001

embedding_size = 512
attention_head_num = 8
hidden_layers = 512
decoder_layers_num = 3
pad_token_id = 0

In [22]:
# Define model
encoder_projection = nn.Linear(2048, embedding_size).to(device)
caption_embedding = nn.Embedding(vocab_size, embedding_size).to(device)

embeddings = caption_embedding(token_ids)

decoder_layer = nn.TransformerDecoderLayer(d_model = embedding_size, nhead = attention_head_num, dim_feedforward = hidden_layers)
transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers = decoder_layers_num)

output_layer = nn.Linear(embedding_size, vocab_size).to(device)

In [23]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index = pad_token_id)
optimizer = optim.Adam(
    list(encoder_projection.parameters()) +
    list(caption_embedding.parameters()) +
    list(transformer_decoder.parameters()) +
    list(output_layer.parameters()),
    lr = learning_rate
)

In [24]:
# Function for tokenizing single caption
def tokenize_caption(caption, word2idx, max_len=200):
    tokens = caption.lower().split()
    token_ids = [word2idx.get("<start>")]
    for token in tokens:
        token_ids.append(word2idx.get(token, word2idx["<unk>"]))
    token_ids.append(word2idx.get("<end>"))
    
    if len(token_ids) < max_len:
        token_ids += [word2idx["<pad>"]] * (max_len - len(token_ids))
    else:
        token_ids = token_ids[:max_len]
    
    return token_ids

# Tokenize and pad all captions
captions_tokenized = [tokenize_caption(caption, word2idx, max_len=200) for caption in captions]
captions_tensor = torch.tensor(captions_tokenized)

captions_tokenized_test = [tokenize_caption(caption, word2idx, max_len=200) for caption in captions_test]
captions_tensor_test = torch.tensor(captions_tokenized_test)

In [25]:
# Create TensorDatasets for train and test data
train_dataset = TensorDataset(features, captions_tensor)
test_dataset = TensorDataset(torch.tensor(features_test), torch.tensor(captions_tensor_test))

  test_dataset = TensorDataset(torch.tensor(features_test), torch.tensor(captions_tensor_test))


In [26]:
# DataLoader for batching
batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [27]:
print("Captions tensor min:", captions_tensor.min())
print("Captions tensor max:", captions_tensor.max())
print("Vocab size:", vocab_size)
# captions_tensor.max() should be smaller than vocab_size

Captions tensor min: tensor(0)
Captions tensor max: tensor(19413)
Vocab size: 22207


In [28]:
def print_device(tensor, name="Tensor"):
    print(f"{name} is on device: {tensor.device}")

# Move models to GPU
encoder_projection = encoder_projection.to(device)
caption_embedding = caption_embedding.to(device)
transformer_decoder = transformer_decoder.to(device)
output_layer = output_layer.to(device)

# Function to run one epoch
def run_one_epoch(loader, is_train=True):
    #traing mode
    if is_train:
        encoder_projection.train()
        caption_embedding.train()
        transformer_decoder.train()
        output_layer.train()
    else: # evaluating mode
        encoder_projection.eval()
        caption_embedding.eval()
        transformer_decoder.eval()
        output_layer.eval()

    total_loss = 0.0

    for features_batch, captions_batch in loader:

        features_batch = features_batch.to(device)
        captions_batch = captions_batch.to(device)

        # Define inputs (all tokens except last) and targets (all tokens except first)
        inputs = captions_batch[:, :-1]
        targets = captions_batch[:, 1:]

        # Project features and add sequence dimension
        features_encoded = encoder_projection(features_batch)
        features_encoded = features_encoded.unsqueeze(1)

        # Embed caption  inputs and add positional encoding
        captions_embedding = caption_embedding(inputs) 
        captions_embedding = positional_encoding(captions_embedding)

        # Prepare memory
        memory = features_encoded.permute(1, 0, 2)

        # Move memory to GPU
        memory = memory.to(device)

        # Prepare target input for decoder
        tgt = captions_embedding.permute(1, 0, 2)

        # Move target in to GPU
        tgt = tgt.to(device)

        # Decoder output
        output = transformer_decoder(tgt=tgt, memory=memory)
        output = output_layer(output)
        output = output.permute(1, 0, 2)
        
        # Flatten for loss
        output = output.reshape(-1, vocab_size)
        targets = targets.reshape(-1)

        loss = criterion(output, targets)

        if is_train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)


# Training process
best_loss = float('inf')

for epoch in range(num_epochs):
    train_loss = run_one_epoch(train_loader, is_train=True)
    
    # Save the model if the loss is better
    if train_loss < best_loss:
        best_loss = train_loss
        torch.save({
            'encoder_projection': encoder_projection.state_dict(),
            'caption_embedding': caption_embedding.state_dict(),
            'transformer_decoder': transformer_decoder.state_dict(),
            'output_layer': output_layer.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch,
            }, "/kaggle/working/best_model.pth")
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}")

# Test the model
test_loss = run_one_epoch(test_loader, is_train=False)
print(f"Test Loss: {test_loss:.4f}")

Epoch [1/10], Train Loss: 7.3410
Epoch [2/10], Train Loss: 6.1477
Epoch [3/10], Train Loss: 5.4428
Epoch [4/10], Train Loss: 4.7813
Epoch [5/10], Train Loss: 4.1733
Epoch [6/10], Train Loss: 3.5728
Epoch [7/10], Train Loss: 2.9994
Epoch [8/10], Train Loss: 2.5115
Epoch [9/10], Train Loss: 2.1082
Epoch [10/10], Train Loss: 1.7725
Test Loss: 2.5277


In [29]:
#Save model
save_path = "/kaggle/working/CNN+Transformer_model.pth"

torch.save({
    'encoder_projection': encoder_projection.state_dict(),
    'caption_embedding': caption_embedding.state_dict(),
    'transformer_decoder': transformer_decoder.state_dict(),
    'output_layer': output_layer.state_dict(),
    'optimizer': optimizer.state_dict(),  # (optional) save optimizer too
    'epoch': epoch,
}, save_path)

print(f"Model saved to {save_path}")

Model saved to /kaggle/working/CNN+Transformer_model.pth


In [30]:
# Load the saved model
checkpoint = torch.load('/kaggle/working/CNN+Transformer_model.pth', map_location=device)

encoder_projection.load_state_dict(checkpoint['encoder_projection'])
caption_embedding.load_state_dict(checkpoint['caption_embedding'])
transformer_decoder.load_state_dict(checkpoint['transformer_decoder'])
output_layer.load_state_dict(checkpoint['output_layer'])

# Move models to GPU
encoder_projection = encoder_projection.to(device)
caption_embedding = caption_embedding.to(device)
transformer_decoder = transformer_decoder.to(device)
output_layer = output_layer.to(device)

  checkpoint = torch.load('/kaggle/working/CNN+Transformer_model.pth', map_location=device)


In [31]:
# Create index-to-word dictionary
idx2word = {idx: word for word, idx in word2idx.items()}

# Decode tokes IDs into text
def decode_caption(caption_ids):
    words = []
    for idx in caption_ids:
        if idx == word2idx['<end>']:
            break
        if idx in (word2idx['<pad>'], word2idx['<start>']):
            continue
        words.append(idx2word.get(idx, "<unk>"))
    return ' '.join(words)

# Set the model to evaluation mode
encoder_projection.eval()
caption_embedding.eval()
transformer_decoder.eval()
output_layer.eval()

real_captions = []
predicted_captions = []

# Generate predictions on test set
with torch.no_grad():
    for features_batch, captions_batch in test_loader:
        features_batch = features_batch.to(device)
        captions_batch = captions_batch.to(device)

        inputs = captions_batch[:, :-1]
        targets = captions_batch[:, 1:]

        # Forward pass
        features_encoded = encoder_projection(features_batch)
        features_encoded = features_encoded.unsqueeze(1)

        captions_embedding = caption_embedding(inputs)
        captions_embedding = positional_encoding(captions_embedding)

        memory = features_encoded.permute(1, 0, 2)
        tgt = captions_embedding.permute(1, 0, 2)

        output = transformer_decoder(tgt=tgt, memory=memory)
        output = output_layer(output)
        output = output.permute(1, 0, 2)

        # Get most probable token
        _, predicted_indices = torch.max(output, dim=-1)

        for i in range(features_batch.size(0)):  # Limit to 5 samples
            real_caption = decode_caption(captions_batch[i].cpu().tolist())
            predicted_caption = decode_caption(predicted_indices[i].cpu().tolist())

            real_captions.append(real_caption)
            predicted_captions.append(predicted_caption)

        break

# Save original and predicted captions
captions_df = pd.DataFrame({
    'real_caption': real_captions,
    'predicted_caption': predicted_captions
})

# Save to CSV file
csv_path = "/kaggle/working/captions.csv"
captions_df.to_csv(csv_path, index=False)

In [32]:
captions_df.head()

Unnamed: 0,real_caption,predicted_caption
0,ct chest axial view showing a huge ascending a...,ct chest axial view showing a huge ascending a...
1,computed tomography (ct) shows floating thromb...,computed tomography (ct) shows thrombus thromb...
2,digitally subtracted angiogram demonstrates ac...,post-operative thorax angiogram demonstrates a...
3,digitally subtracted angiogram of the ima demo...,post-operative tomographic angiogram of the pa...
4,angle measurement of a type 1 canal.,angle measurement of a type 1 month


In [33]:
# Compute BERTScore
P, R, F1 = score(predicted_captions, real_captions, lang="en", verbose=True)

print(f"Average BERTScore F1: {F1.mean().item():.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

2025-04-28 11:57:01.027667: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745841421.188957      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745841421.237070      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.89 seconds, 71.61 sentences/sec
Average BERTScore F1: 0.9355


In [34]:
print(f"BERTScore: Precision={P.mean().item():.4f}, Recall={R.mean().item():.4f}, F1={F1.mean().item():.4f}")

BERTScore: Precision=0.9432, Recall=0.9282, F1=0.9355


In [36]:
# Compute MedBERTScore
P_med, R_med, F1_med = score(
    predicted_captions,
    real_captions,
    model_type="emilyalsentzer/Bio_ClinicalBERT",
    num_layers=8,
    lang="en",
    rescale_with_baseline=True
)
print(f"MedBERTScore: Precision={P_med.mean().item():.4f}, Recall={R_med.mean().item():.4f}, F1={F1_med.mean().item():.4f}")

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

MedBERTScore: Precision=0.8713, Recall=0.8234, F1=0.8460




model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [37]:
# Save results to txt file
results = {
    "BERTScore Precision": P.mean().item(),
    "BERTScore Recall": R.mean().item(),
    "BERTScore F1": F1.mean().item(),
    "MedBERTScore Precision": P_med.mean().item(),
    "MedBERTScore Recall": R_med.mean().item(),
    "MedBERTScore F1": F1_med.mean().item(),
}

results_path = "/kaggle/working/results_CNNTransformer.txt"

with open(results_path, "w") as f:
    for metric, value in results.items():
        f.write(f"{metric}: {value:.4f}\n")

print(f"Results saved to {results_path}")


Results saved to /kaggle/working/results_CNNTransformer.txt
