# Import

In [None]:
# Mount Google Drive (Colab only) so that files under /content/drive are
# reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install rouge_score
!pip install bert_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=933c87a45a5c033d6ba24c43fe460becc7e9d65301f0255175aa56a312812125
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_runtime_cu12

In [None]:
import os
import sys
import json
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms, models

from transformers import (
    ViTModel,
    BertModel,
    BertTokenizer,
    AutoModel,
    AutoTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
    DetrForObjectDetection,
    DetrImageProcessor,
    VivitModel,
    AdamW,
    get_linear_schedule_with_warmup,
)
from transformers.modeling_outputs import BaseModelOutput
from transformers import logging

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score



In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Update with your own save path.
# The cells below read from / write to the Datasets/, Models/ and Predictions/
# sub-folders of this directory.
save_dir = "/content/drive/My Drive/Master Thesis/CholecT50"

# Load datasets and create dataloaders

In [None]:
# The frame dataset is stored as five shard files named after the range they
# cover (0_10, 10_20, ..., 40_50); concatenate them into one flat list.
dataset = []
for shard_idx in range(5):
    range_start = shard_idx * 10
    range_end = (shard_idx + 1) * 10
    shard = torch.load(f"{save_dir}/Datasets/frame_dataset_{range_start}_{range_end}.pt")
    print(shard_idx)  # progress: index of the shard that was just loaded
    dataset.extend(shard)

0
1
2
3
4


In [None]:
# Sanity check: show the keys of one sample, the number of samples, and the
# type/content of the fields consumed by the model (caption, frame, objects).
print(dataset[0].keys())
print(len(dataset))
print(type(dataset[0]["frame_caption"]), dataset[0]["frame_caption"])
print(type(dataset[0]["frame"]), dataset[0]["frame"].shape)
print(type(dataset[0]["objects"]), dataset[0]["objects"])

dict_keys(['video', 'frame_number', 'frame', 'object_labels', 'objects', 'frame_caption'])
89827
<class 'str'>  During phase preparation, the grasper is grasping the gallbladder
<class 'torch.Tensor'> torch.Size([3, 224, 224])
<class 'list'> ['grasper', 'gallbladder']


In [None]:
def collate_fn(batch, max_objects=10):
    """Collate a list of dataset samples into one batch dictionary.

    Args:
        batch: list of sample dicts with the keys 'video', 'frame_number',
            'frame' (tensors of identical shape), 'frame_caption' and
            'objects' (a sequence of object-name strings).
        max_objects: every object list is padded with empty strings, or
            truncated, to exactly this length. Defaults to 10, the value that
            was previously hard-coded, so `DataLoader(collate_fn=collate_fn)`
            behaves as before.

    Returns:
        dict with the same keys (except 'object_labels', which is not
        forwarded): 'frame' is a stacked tensor with a new leading batch
        dimension, every other value is a list with one entry per sample.

    Raises:
        ValueError: if `max_objects` is negative.
    """
    if max_objects < 0:
        raise ValueError("max_objects must be non-negative")

    def _fit(objects):
        # A negative multiplier yields an empty padding list, so one
        # expression covers both the padding and the truncation case.
        # list(...) always builds a new list, leaving the sample untouched.
        padded = list(objects) + [''] * (max_objects - len(objects))
        return padded[:max_objects]

    return {
        'video': [item['video'] for item in batch],
        'frame_number': [item['frame_number'] for item in batch],
        'frame': torch.stack([item['frame'] for item in batch]),
        'frame_caption': [item['frame_caption'] for item in batch],
        'objects': [_fit(item['objects']) for item in batch],
    }

In [None]:
# 80 % / 10 % / 10 % train / validation / test split; the test split receives
# whatever is left after rounding the other two down.
num_samples = len(dataset)
train_size = int(0.8 * num_samples)
val_size = int(0.1 * num_samples)
test_size = num_samples - train_size - val_size

# A fixed seed makes the split identical on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

batch_size = 8
# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)


# First Training

This section initializes the model and trains it using the ground-truth objects from the dataset.

## Model

In [None]:



class FrameCaptioner(nn.Module):
    """Caption a frame from its pixels plus a list of object names.

    The frame is encoded with a ViT and the object names with DistilBERT. Both
    token sequences are projected from 768 to 512 features, concatenated along
    the sequence axis and handed to a T5 model in place of its encoder output,
    so only T5's decoder stack is exercised.
    """

    def __init__(self):
        super().__init__()

        # Frame encoder
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.video_proj = nn.Linear(768, 512)

        # Object encoder
        self.text_encoder = AutoModel.from_pretrained("distilbert-base-uncased")
        self.text_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        self.text_proj = nn.Linear(768, 512)

        # Text decoder
        self.decoder = T5ForConditionalGeneration.from_pretrained("t5-small")
        self.decoder_tokenizer = T5Tokenizer.from_pretrained("t5-small")

    def forward(self, frame, objects, frame_caption=None):
        """Compute the training loss or generate captions.

        Args:
            frame: batch-first image tensor fed to the ViT
                (the dataset stores frames of shape (3, 224, 224)).
            objects: one list of object-name strings per sample; the names of
                a sample are joined with spaces before tokenization.
            frame_caption: list of target captions. When given, the method
                runs in training mode; when None, captions are generated.

        Returns:
            With `frame_caption`: the T5 output object (exposes `.loss` and
            `.logits`). Without: a list with one decoded caption per sample.
            Generation samples (`do_sample=True`), so repeated calls may
            return different captions.
        """
        batch_size = frame.shape[0]

        # frame -> (batch, image tokens, 512)
        frame_features = self.vit(frame).last_hidden_state
        frame_features = self.video_proj(frame_features)

        # objects -> (batch, text tokens, 512)
        object_texts = [" ".join(obj_list) for obj_list in objects]
        tokenized_objects = self.text_tokenizer(
            object_texts, padding=True, truncation=True, return_tensors="pt"
        ).to(frame.device)
        text_features = self.text_encoder(**tokenized_objects).last_hidden_state
        text_features = self.text_proj(text_features)

        # fuse: concatenate along the sequence axis
        # NOTE(review): no attention mask is passed to the decoder, so it also
        # attends to the positions of padded object tokens. Left unchanged to
        # stay compatible with the existing checkpoints.
        combined_features = torch.cat((frame_features, text_features), dim=1)

        # training
        if frame_caption is not None:
            target_ids = self.decoder_tokenizer(
                frame_caption, padding=True, truncation=True, return_tensors="pt"
            ).input_ids.to(frame.device)

            # Padding positions must not contribute to the loss: T5 ignores
            # label positions equal to -100.
            labels = target_ids.masked_fill(
                target_ids == self.decoder_tokenizer.pad_token_id, -100
            )

            outputs = self.decoder(
                encoder_outputs=(combined_features,),
                labels=labels
            )
            return outputs
        # generation
        else:
            # NOTE(review): this placeholder is a float tensor; it appears to
            # serve only as the batch-size carrier because `encoder_outputs`
            # is supplied. Confirm before changing its dtype.
            input_ids = torch.ones(batch_size, 1).fill_(self.decoder_tokenizer.pad_token_id).to(frame.device)
            output = self.decoder.generate(
                input_ids=input_ids,
                encoder_outputs=BaseModelOutput(last_hidden_state=combined_features),
                max_length=64,
                temperature=0.2,
                top_k=10,
                top_p=0.7,
                do_sample=True,
                no_repeat_ngram_size=2,
            )
            generated_captions = [self.decoder_tokenizer.decode(seq, skip_special_tokens=True) for seq in output]
            return generated_captions


## Train

In [None]:

def run_epoch(model, loader, optimizer=None, train=True, temperature=1.0, scheduler=None):
    """Run one pass over `loader` using the ground-truth objects of each batch.

    Args:
        model: callable as `model(frame, objects, captions)` returning an
            object with a `.loss` tensor.
        loader: iterable of batches with the keys 'frame', 'frame_caption'
            and 'objects'.
        optimizer: optimizer to step; required when `train` is True.
        train: True to update the weights, False to only measure the loss
            (model in eval mode, gradients disabled).
        temperature: plain divisor applied to the loss. It scales the value
            that is back-propagated and reported; it is not a softmax
            temperature.
        scheduler: optional learning-rate scheduler stepped after every
            optimizer step. When omitted, a module-level variable named
            `scheduler` is used if one exists, which is what earlier versions
            of this function relied on implicitly.

    Returns:
        Mean (scaled) loss over the batches, or NaN when `loader` is empty.

    Raises:
        ValueError: if `train` is True and no optimizer was supplied.
    """
    if train and optimizer is None:
        raise ValueError("run_epoch(train=True) requires an optimizer")
    if scheduler is None:
        # Backward compatibility with callers that define a global scheduler.
        scheduler = globals().get("scheduler")

    mode = "Training" if train else "Validation"
    if train:
        model.train()
    else:
        model.eval()
    total_loss = 0
    total_batches = len(loader)

    for batch_idx, batch in enumerate(loader):
        frame = batch['frame'].to(device)
        target = batch['frame_caption']
        objects = batch['objects']

        if train:
            optimizer.zero_grad()
            outputs = model(frame, objects, target)
            loss = outputs.loss / temperature
            loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
        else:
            with torch.no_grad():
                outputs = model(frame, objects, target)
                loss = outputs.loss / temperature

        total_loss += loss.item()

        # Print progress bar (rewritten in place via the carriage return)
        progress = (batch_idx + 1) / total_batches
        bar_length = 20
        filled_length = int(bar_length * progress)
        bar = "=" * filled_length + " " * (bar_length - filled_length)
        percentage = int(progress * 100)
        sys.stdout.write(f"\r[{bar}] {percentage}% - Batch {batch_idx+1}/{total_batches} - Loss: {loss.item():.4f} - Avg {mode} Loss: {total_loss / (batch_idx + 1):.4f}")
        sys.stdout.flush()

    print()
    # An empty loader has no meaningful average; avoid a ZeroDivisionError.
    return total_loss / total_batches if total_batches else float("nan")


# Initialize
model = FrameCaptioner().to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
# NOTE(review): `criterion` is not used by run_epoch (the loss comes from the
# model output); kept only in case other cells reference it.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
num_epochs = 15
temperature = 2.0  # divisor applied to the loss inside run_epoch
# Linear decay of the learning rate to zero over all optimizer steps, no warm-up.
num_training_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# The following sections load Models/model_FC.pth, so the checkpoint has to be
# written here for the notebook to run top to bottom.
os.makedirs(f"{save_dir}/Models", exist_ok=True)

# Train loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}: ")
    loss = run_epoch(model, train_loader, optimizer, train=True, temperature=temperature)
    val_loss = run_epoch(model, val_loader, train=False, temperature=temperature)
    # Overwritten after every epoch, so the file always holds the latest weights.
    torch.save(model.state_dict(), f"{save_dir}/Models/model_FC.pth")

## Print examples

In [None]:
# Rebuild the model and restore the weights of the first training stage.
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime;
# weights_only restricts unpickling to tensor data (the file is a state_dict).
model = FrameCaptioner().to(device)
model.load_state_dict(
    torch.load(f"{save_dir}/Models/model_FC.pth", map_location=device, weights_only=True)
)

In [None]:
def print_examples(model, loader, device, num_examples = 5):
    """Print predicted and target captions side by side.

    Args:
        model: called as `model(frame, objects)`; must return one caption per
            sample of the batch and provide `eval()`.
        loader: iterable of batches with the keys 'frame', 'objects' and
            'frame_caption'.
        device: device the frames are moved to before the forward pass.
        num_examples: number of prediction/target pairs to print; iteration
            stops as soon as that many have been shown.
    """
    model.eval()
    shown = 0
    with torch.no_grad():
        for batch in loader:
            if shown >= num_examples:
                break

            frame = batch['frame'].to(device)
            objects = batch['objects']
            targets = batch['frame_caption']

            predicted_captions = model(frame, objects)

            # Pair every prediction with the target of the same sample.
            for predicted_caption, target in zip(predicted_captions, targets):
                if shown >= num_examples:
                    break
                print(f"Predicted: {predicted_caption}")
                print(f"Target: {target}")
                print("="*50)
                shown += 1

# Show 10 qualitative examples from the held-out test split.
print_examples(model, test_loader, device, 10)

Predicted: During phase carlot-triangle-dissection, the grasper is grasping the gallbladder, their bipolar is dissecting the cystic_artery
Target:  During phase carlot-triangle-dissection, the bipolar is dissecting the cystic_artery, the grasper is grasping the gallbladder
Predicted: During phase gallbladder-packaging, the grasper is grasping the specimen_bag
Target:  During phase gallbladder-packaging, the grasper is grasping the specimen_bag
Predicted: During phase gallbladder-dissection, the hook is dissecting the gallbloddger
Target:  During phase gallbladder-dissection, the hook is dissecting the gallbladder
Predicted: During phase carlot-triangle-dissection, the grasper is retracting the gallbladder, this hook is present
Target:  During phase carlot-triangle-dissection, the grasper is retracting the gallbladder, the hook is present
Predicted: During phase gallbladder-dissection, the grasper is retracting the gallbleddger, this hook is present
Target:  During phase gallbladder-dis

# Train with Detected Objects

The trained model is loaded and fine-tuned using the objects detected by the object-detection (OD) model, which are stored in `predicted_objects.json`.

In [None]:
# Detected objects, looked up below as
# predicted_objects_dataset[video][frame_number]["predicted_objects"].
predicted_objects_path = f"{save_dir}/Predictions/predicted_objects.json"
with open(predicted_objects_path, "r") as f:
    predicted_objects_dataset = json.load(f)

In [None]:
def run_epoch(model, loader, optimizer=None, train=True, temperature=1.0, scheduler=None):
    """Run one pass over `loader` using the *detected* objects of each frame.

    Unlike the ground-truth variant defined earlier in the notebook (which
    this definition replaces once the cell is run), the object lists are read
    from the module-level `predicted_objects_dataset`, indexed by the batch's
    'video' and 'frame_number' entries.

    Args:
        model: callable as `model(frame, objects, captions)` returning an
            object with a `.loss` tensor.
        loader: iterable of batches with the keys 'frame', 'frame_caption',
            'video' and 'frame_number'.
        optimizer: optimizer to step; required when `train` is True.
        train: True to update the weights, False to only measure the loss
            (model in eval mode, gradients disabled).
        temperature: plain divisor applied to the loss. It scales the value
            that is back-propagated and reported; it is not a softmax
            temperature.
        scheduler: optional learning-rate scheduler stepped after every
            optimizer step. When omitted, a module-level variable named
            `scheduler` is used if one exists, which is what earlier versions
            of this function relied on implicitly.

    Returns:
        Mean (scaled) loss over the batches, or NaN when `loader` is empty.

    Raises:
        ValueError: if `train` is True and no optimizer was supplied.
    """
    if train and optimizer is None:
        raise ValueError("run_epoch(train=True) requires an optimizer")
    if scheduler is None:
        # Backward compatibility with callers that define a global scheduler.
        scheduler = globals().get("scheduler")

    mode = "Training" if train else "Validation"
    if train:
        model.train()
    else:
        model.eval()
    total_loss = 0
    total_batches = len(loader)

    for batch_idx, batch in enumerate(loader):
        frame = batch['frame'].to(device)
        target = batch['frame_caption']

        # One list of detected object names per sample of the batch.
        predicted_objects = [
            predicted_objects_dataset[video][frame_number]["predicted_objects"]
            for video, frame_number in zip(batch['video'], batch['frame_number'])
        ]

        if train:
            optimizer.zero_grad()
            outputs = model(frame, predicted_objects, target)
            loss = outputs.loss / temperature
            loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
        else:
            with torch.no_grad():
                outputs = model(frame, predicted_objects, target)
                loss = outputs.loss / temperature
        total_loss += loss.item()

        # Print progress bar (rewritten in place via the carriage return)
        progress = (batch_idx + 1) / total_batches
        bar_length = 20
        filled_length = int(bar_length * progress)
        bar = "=" * filled_length + " " * (bar_length - filled_length)
        percentage = int(progress * 100)
        sys.stdout.write(f"\r[{bar}] {percentage}% - Batch {batch_idx+1}/{total_batches} - Loss: {loss.item():.4f} - Avg {mode} Loss: {total_loss / (batch_idx + 1):.4f}")
        sys.stdout.flush()

    print()
    # An empty loader has no meaningful average; avoid a ZeroDivisionError.
    return total_loss / total_batches if total_batches else float("nan")



# Initialize
model = FrameCaptioner().to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
# NOTE(review): `criterion` is not used by run_epoch (the loss comes from the
# model output); kept only in case other cells reference it.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
num_epochs = 10
temperature = 2.0  # divisor applied to the loss inside run_epoch
# Linear decay of the learning rate to zero over all optimizer steps, no warm-up.
num_training_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Start from the weights of the first training stage. The checkpoint lives in
# the Models/ sub-folder, the same location the other cells load it from.
model.load_state_dict(
    torch.load(f"{save_dir}/Models/model_FC.pth", map_location=device, weights_only=True)
)


# Train Loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}: ")
    loss = run_epoch(model, train_loader, optimizer, train=True, temperature=temperature)
    val_loss = run_epoch(model, val_loader, train=False, temperature=temperature)
    # Overwritten after every epoch, so the file always holds the latest weights.
    torch.save(model.state_dict(), f"{save_dir}/Models/model_FC_robust.pth")


  model.load_state_dict(torch.load("/content/drive/My Drive/Master Thesis/CholecT50/model_FC.pth"))


Epoch 1/10: 
Epoch 2/10: 
Epoch 3/10: 
Epoch 4/10: 
Epoch 5/10: 
Epoch 6/10: 
Epoch 7/10: 
Epoch 8/10: 
Epoch 9/10: 
Epoch 10/10: 


# Compare

In [None]:


def evaluate_model(model, loader, device):
    """Score generated captions against the reference captions of `loader`.

    Captions are generated from the frames and the *detected* objects (read
    from the module-level `predicted_objects_dataset`). Every
    prediction/reference pair is scored individually with BLEU, ROUGE-1/2/L
    (F-measure) and BERTScore, and the per-sample scores are averaged.

    Args:
        model: called as `model(frame, objects)`; must return one caption
            string per sample of the batch.
        loader: iterable of batches with the keys 'frame', 'frame_caption',
            'video' and 'frame_number'.
        device: device the frames are moved to before the forward pass.

    Returns:
        dict mapping 'bleu', 'rouge1', 'rouge2', 'rougeL', 'bert_precision',
        'bert_recall' and 'bert_f1' to their average (NaN if nothing was
        scored). The same values are printed.
    """
    # Local import: the module-level import only exposes the `score` function,
    # which reloads the BERTScore model on every call.
    from bert_score import BERTScorer

    def _mean(values):
        return sum(values) / len(values) if values else float("nan")

    model.eval()
    logging.set_verbosity_error()  # silence transformers warnings while scoring
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)  # loaded once
    total_batches = len(loader)
    smooth_fn = SmoothingFunction().method1
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    bleu_scores = []
    bert_precision, bert_recall, bert_f1 = [], [], []

    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            frame = batch['frame'].to(device)
            targets = list(batch['frame_caption'])

            predicted_objects = [
                predicted_objects_dataset[video][frame_number]["predicted_objects"]
                for video, frame_number in zip(batch['video'], batch['frame_number'])
            ]
            predicted_captions = list(model(frame, predicted_objects))

            # The model returns one caption per sample, so score pair by pair.
            for predicted_caption, target in zip(predicted_captions, targets):
                # Compute ROUGE scores (signature is score(target, prediction))
                scores = rouge.score(target, predicted_caption)
                for key in rouge_scores:
                    rouge_scores[key].append(scores[key].fmeasure)

                # Compute BLEU score
                reference = [target.split()]
                hypothesis = predicted_caption.split()
                bleu = sentence_bleu(reference, hypothesis, smoothing_function=smooth_fn)
                bleu_scores.append(bleu)

            # Compute BERTScore for the whole batch at once; each returned
            # tensor holds one value per pair.
            if predicted_captions:
                P, R, F1 = bert_scorer.score(predicted_captions, targets)
                bert_precision.extend(P.tolist())
                bert_recall.extend(R.tolist())
                bert_f1.extend(F1.tolist())

            # Calculate progress
            progress = (batch_idx + 1) / total_batches
            bar_length = 20
            filled_length = int(bar_length * progress)
            bar = "=" * filled_length + " " * (bar_length - filled_length)
            percentage = int(progress * 100)
            sys.stdout.write(f"\r[{bar}] {percentage}% - Batch {batch_idx+1}/{total_batches}")
            sys.stdout.flush()

    # Compute average scores
    averages = {
        'bleu': _mean(bleu_scores),
        'rouge1': _mean(rouge_scores['rouge1']),
        'rouge2': _mean(rouge_scores['rouge2']),
        'rougeL': _mean(rouge_scores['rougeL']),
        'bert_precision': _mean(bert_precision),
        'bert_recall': _mean(bert_recall),
        'bert_f1': _mean(bert_f1),
    }

    print("\nOverall Scores:")
    print(f"Average BLEU: {averages['bleu']:.4f}")
    print(f"Average ROUGE-1: {averages['rouge1']:.4f}")
    print(f"Average ROUGE-2: {averages['rouge2']:.4f}")
    print(f"Average ROUGE-L: {averages['rougeL']:.4f}")
    print(f"Average BERT Precision: {averages['bert_precision']:.4f}")
    print(f"Average BERT Recall: {averages['bert_recall']:.4f}")
    print(f"Average BERT F1: {averages['bert_f1']:.4f}")

    return averages





# Both checkpoints are evaluated on the same test split with detected objects.
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime;
# weights_only restricts unpickling to tensor data (the files are state_dicts).

# Before Robustness
model = FrameCaptioner().to(device)
model.load_state_dict(
    torch.load(f"{save_dir}/Models/model_FC.pth", map_location=device, weights_only=True)
)
print("Results of frame captioner using Object detector before robustness: ")
evaluate_model(model, test_loader, device)


# After Robustness
model = FrameCaptioner().to(device)
model.load_state_dict(
    torch.load(f"{save_dir}/Models/model_FC_robust.pth", map_location=device, weights_only=True)
)
print("Results of frame captioner using Object detector after robustness: ")
evaluate_model(model, test_loader, device)


  model.load_state_dict(torch.load("/content/drive/My Drive/Master Thesis/CholecT50/final_FC.pth"))


Results of frame captioner using Object detector before robustness: 
Overall Scores:
Average BLEU: 0.6395
Average ROUGE-1: 0.8351
Average ROUGE-2: 0.7747
Average ROUGE-L: 0.8116
Average BERT Precision: 0.7771
Average BERT Recall: 0.7644
Average BERT F1: 0.7707


  model.load_state_dict(torch.load("/content/drive/My Drive/Master Thesis/CholecT50/final_FC_robust.pth"))


Results of frame captioner using Object detector after robustness: 
Overall Scores:
Average BLEU: 0.7267
Average ROUGE-1: 0.8700
Average ROUGE-2: 0.8096
Average ROUGE-L: 0.8637
Average BERT Precision: 0.7745
Average BERT Recall: 0.8365
Average BERT F1: 0.8052


# Store Generated captions

Run this section to store the frame captions generated by the robust model

In [None]:
# Load the model fine-tuned on detected objects.
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime;
# weights_only restricts unpickling to tensor data (the file is a state_dict).
frame_captioner = FrameCaptioner().to(device)
frame_captioner.load_state_dict(
    torch.load(f"{save_dir}/Models/model_FC_robust.pth", map_location=device, weights_only=True)
)

  frame_captioner.load_state_dict(torch.load("/content/drive/My Drive/Master Thesis/CholecT50/final_FC_robust.pth"))


<All keys matched successfully>

In [None]:
batch_size = 8
# Inference over the complete dataset: shuffling brings nothing here, and a
# fixed order keeps the layout of the exported JSON the same between runs.
loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# Detected objects per video and frame; reloaded here so that this section can
# be run without executing the training sections first.
predicted_objects_path = f"{save_dir}/Predictions/predicted_objects.json"
with open(predicted_objects_path, "r") as f:
    predicted_objects_dataset = json.load(f)

In [None]:
def predict_dataset(FrameCaptioner, loader, device):
    """Generate a caption for every frame served by `loader`.

    Captions are produced from the frames and the *detected* objects (read
    from the module-level `predicted_objects_dataset`).

    Args:
        FrameCaptioner: the model *instance* to run, called as
            `model(frame, objects)`. The parameter name shadows the class of
            the same name inside this function; it is kept for compatibility
            with existing callers.
        loader: iterable of batches with the keys 'frame', 'video' and
            'frame_number'.
        device: device the frames are moved to before the forward pass.

    Returns:
        Nested dict `{video: {frame_number: {"predicted_caption": str}}}`.
    """
    captioner = FrameCaptioner
    # Do not rely on the caller having switched the model to inference mode.
    captioner.eval()

    prediction_dataset = {}
    total_batches = len(loader)

    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            frame = batch['frame'].to(device)
            video_folder = batch['video']
            frame_name = batch['frame_number']

            predicted_objects = [
                predicted_objects_dataset[vid][frm]["predicted_objects"]
                for vid, frm in zip(video_folder, frame_name)
            ]
            predicted_caption = captioner(frame, predicted_objects)

            # File every caption under its video and frame number.
            for vid, frm, caption in zip(video_folder, frame_name, predicted_caption):
                prediction_dataset.setdefault(vid, {})[frm] = {
                    "predicted_caption": caption
                }

            # Print progress bar
            progress = (batch_idx + 1) / total_batches
            bar_length = 20
            filled_length = int(bar_length * progress)
            bar = "=" * filled_length + " " * (bar_length - filled_length)
            percentage = int(progress * 100)
            sys.stdout.write(f"\r[{bar}] {percentage}% - Batch {batch_idx+1}/{total_batches}")
            sys.stdout.flush()

    return prediction_dataset

# Caption every frame of the dataset with the model loaded above.
prediction_dataset = predict_dataset(frame_captioner, loader, device)

# Save to JSON file
save_path = f"{save_dir}/Predictions/predicted_frames.json"
with open(save_path, "w") as f:
    json.dump(prediction_dataset, f, indent=4)


