In [None]:
import os
# Terminate the interpreter process immediately with exit status 0, skipping normal
# cleanup; presumably used to force a fresh runtime before running the notebook.
os._exit(00)

# Import

In [1]:
from google.colab import drive
# Mount Google Drive under /content/drive; save_dir (defined in a later cell) lives below this path.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets
!pip install rouge_score
!pip install bert_score
!pip install natsort

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.0-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [3]:
import os
import sys
import json
import pickle
import re
import math
import numpy as np
import cv2
import matplotlib.pyplot as plt

from collections import defaultdict
from natsort import natsorted
from sklearn.model_selection import train_test_split

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
from datasets import Dataset

import torch
import torch.nn as nn
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
from torch.optim.lr_scheduler import StepLR

import torchvision.models as models
from torchvision import transforms

from transformers import (
    ViTModel,
    VivitModel,
    AutoModel,
    AutoTokenizer,
    T5ForConditionalGeneration,
    DetrForObjectDetection,
    DetrImageProcessor,
    AdamW,
    get_linear_schedule_with_warmup,
)
from transformers.modeling_outputs import BaseModelOutput
from transformers import logging



In [4]:
# Update with your own save path.
# The notebook reads from and writes to the sub-folders Datasets/, Models/ and Predictions/ below it.
save_dir = "/content/drive/My Drive/Master Thesis/CholecT50"

In [5]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load Datasets and create dataloader

Because of the limited capacity of the GPU, the dataset is loaded and used in two separate passes: first the first 30 videos, then the last 20.

In [6]:
# range(3)    -> loads the first 30 videos
# range(3, 5) -> loads the last 20 videos
# Every file on disk holds the clips of ten consecutive videos.

dataset = []
for i in range(3):
    chunk = torch.load(f"{save_dir}/Datasets/clip_dataset_{int(i*10)}_{int(i*10+9)}.pt")
    print(i)
    dataset += chunk


0
1
2


In [7]:
# Sanity check: keys of one sample, dataset size, and the type / size of each field
sample = dataset[0]
print(sample.keys())
print(len(dataset))
print(type(sample["frame_captions"]), type(sample["frame_captions"][0]), len(sample["frame_captions"]))
print(type(sample["clip"]), sample["clip"].shape)
print(type(sample["clip_caption"]), sample["clip_caption"])

dict_keys(['video', 'frame_numbers', 'clip', 'frame_captions', 'clip_caption'])
3873
<class 'list'> <class 'str'> 32
<class 'torch.Tensor'> torch.Size([32, 3, 224, 224])
<class 'str'> First, during the phase of preparation lasting 22 seconds, the grasper is grasping the gallbladder while the hook is present. Then, during the phase of carlot-triangle-dissection lasting 10 seconds, the grasper is grasping the gallbladder while the hook is present.


In [8]:
# Custom collate function
def collate_fn(batch):
    """Merge a list of dataset samples into a single batch dictionary.

    The clip tensors are stacked along a new leading dimension; every other
    field is gathered into a plain Python list with one entry per sample.
    """
    clips, folders, frame_ids, frame_caps, clip_caps = [], [], [], [], []
    for sample in batch:
        clips.append(sample['clip'])
        folders.append(sample['video'])
        frame_ids.append(sample['frame_numbers'])
        frame_caps.append(sample['frame_captions'])
        clip_caps.append(sample['clip_caption'])

    collated = {}
    collated['video'] = torch.stack(clips)
    collated['video_folder'] = folders
    collated['frame_numbers'] = frame_ids
    collated['frame_captions'] = frame_caps
    collated['clip_caption'] = clip_caps
    return collated

In [9]:
# 80 % / 10 % split sizes; the test split receives whatever is left after rounding.
# NOTE(review): the split is made over clips, so clips of one video can end up in different splits.
train_size, val_size = int(0.8 * len(dataset)), int(0.1 * len(dataset))
test_size = len(dataset) - (train_size + val_size)

# A fixed seed keeps the three subsets identical between runs
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# Create DataLoaders (only the training loader is shuffled)
batch_size = 1
loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)


# First Model

Since the dataset needs to be loaded in separate parts, training is continued from one part to the next by saving and reloading the model and optimizer.

## model

In [11]:
class SurgicalVideoCaptioner(nn.Module):
    """Clip captioner that fuses video tokens and frame-caption tokens.

    A ViViT encoder embeds the clip and a DistilBERT encoder embeds the
    per-frame captions. Both token sequences are linearly projected,
    concatenated along the sequence dimension and handed to a Flan-T5 model
    in place of its own encoder output.
    """

    def __init__(self, device):
        """Load the pretrained backbones and tokenizers and move every module to `device`."""
        super().__init__()
        self.device = torch.device(device)

        # Video encoder (ViViT)
        self.vivit = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400").to(self.device)
        self.video_proj = nn.Linear(768, 768).to(self.device)

        # Text encoder (DistilBERT)
        self.text_encoder = AutoModel.from_pretrained("distilbert-base-uncased").to(self.device)
        self.text_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        self.text_proj = nn.Linear(768, 768).to(self.device)

        # T5 decoder
        self.decoder = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base").to(self.device)
        self.decoder_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

    def forward(self, video, frame_captions, clip_caption=None, max_tokens=128):
        """Compute the training loss or generate captions.

        Args:
            video: clip tensor whose first dimension is the batch.
            frame_captions: one list of caption strings per clip. All clips of a
                batch must contribute the same number of captions, because the
                text tokens are reshaped to (batch, -1, 768).
            clip_caption: target caption(s). When given, the teacher-forced
                decoder output (including `.loss`) is returned; when None,
                captions are generated.
            max_tokens: padded token length of every frame caption and maximum
                length of a generated caption.

        Returns:
            The decoder output when `clip_caption` is given, otherwise a list
            with one decoded caption string per clip.
        """
        batch_size = video.size(0)

        # video
        video = video.to(self.device)
        video_features = self.vivit(video).last_hidden_state
        video_features = self.video_proj(video_features)

        # text: all captions of the batch are encoded in one flat call and then
        # regrouped per clip
        flattened_captions = [cap for caps in frame_captions for cap in caps]
        text_inputs = self.text_tokenizer(
            flattened_captions,
            padding="max_length",
            truncation=True,
            max_length=max_tokens,
            return_tensors="pt",
        ).to(self.device)
        text_features = self.text_encoder(**text_inputs).last_hidden_state
        text_features = text_features.view(batch_size, -1, 768)
        text_features = self.text_proj(text_features)

        # fuse
        # NOTE(review): the frame captions are padded to `max_tokens`, but no
        # attention mask is passed to the decoder, so cross-attention also sees
        # the padded positions. Left as is to stay compatible with checkpoints
        # trained with this forward pass.
        fused_features = torch.cat((video_features, text_features), dim=1)

        # training
        if clip_caption is not None:
            labels = self.decoder_tokenizer(
                clip_caption,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=128
            ).input_ids.to(self.device)
            # Padding positions must not contribute to the loss; -100 is the
            # index ignored by the T5 loss. This only matters when captions of
            # different length are batched together (batch_size > 1).
            labels = labels.masked_fill(labels == self.decoder_tokenizer.pad_token_id, -100)

            outputs = self.decoder(
                encoder_outputs=(fused_features,),
                labels=labels
            )

            return outputs

        # generation
        # Token ids have to be integers; the tensor is created directly on the
        # target device. With `encoder_outputs` supplied it presumably only
        # tells generate() the batch size.
        input_ids = torch.full(
            (batch_size, 1),
            self.decoder_tokenizer.pad_token_id,
            dtype=torch.long,
            device=self.device,
        )
        output = self.decoder.generate(
            input_ids=input_ids,
            encoder_outputs=BaseModelOutput(last_hidden_state=fused_features),
            max_length=max_tokens,
            temperature=0.2,
            top_k=10,
            top_p=0.7,
            do_sample=True,
            no_repeat_ngram_size=2
        )
        generated_captions = [self.decoder_tokenizer.decode(out, skip_special_tokens=True) for out in output]
        return generated_captions




## Train

In [None]:


def run_epoch(model, loader, optimizer=None, scheduler=None, train=True, temperature=1.0):
    """Run one training or validation pass over `loader`.

    Args:
        model: captioner called as ``model(videos, frame_captions, targets)``;
            its output must expose ``.loss``.
        loader: iterable of batches built by ``collate_fn``.
        optimizer: optimizer to step; required when ``train`` is True.
        scheduler: optional learning-rate scheduler, stepped after every
            optimizer step.
        train: True for a training pass (gradients and updates), False for a
            validation pass (no gradients).
        temperature: divisor applied to the loss before backpropagation and
            logging.

    Returns:
        Mean scaled loss over the batches that were actually processed, or NaN
        when every batch was skipped (or the loader is empty).
    """
    mode = "Training" if train else "Validation"
    if train:
        model.train()
    else:
        model.eval()

    # Prefer the model's own device; fall back to the notebook-level `device`.
    target_device = model.device if hasattr(model, "device") else device

    total_loss = 0.0
    processed = 0  # batches that contributed a loss (skipped ones do not count)
    total_batches = len(loader)

    for batch_idx, batch in enumerate(loader):
        targets = batch['clip_caption']

        # Clips without a reference caption cannot be supervised
        if targets[0] == "":
            continue

        videos = batch['video'].to(target_device)
        frame_captions = batch['frame_captions']

        if train:
            optimizer.zero_grad()
            outputs = model(videos, frame_captions, targets)
            loss = outputs.loss / temperature
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            if scheduler:
                scheduler.step()
        else:
            with torch.no_grad():
                outputs = model(videos, frame_captions, targets)
                loss = outputs.loss / temperature

        total_loss += loss.item()
        processed += 1

        # Print progress bar
        progress = (batch_idx + 1) / total_batches
        bar_length = 20
        filled_length = int(bar_length * progress)
        bar = "=" * filled_length + " " * (bar_length - filled_length)
        percentage = int(progress * 100)
        sys.stdout.write(f"\r[{bar}] {percentage}% - Batch {batch_idx+1}/{total_batches} - {mode} Loss: {loss.item():.4f} - Avg {mode} Loss: {total_loss / processed:.4f}")
        sys.stdout.flush()

    print()
    # Dividing by len(loader) would understate the loss whenever batches were skipped
    return total_loss / processed if processed else float("nan")

# Initialize
model = SurgicalVideoCaptioner(device).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Uncomment to resume from the checkpoints written by the loop below
# (e.g. when continuing with the second part of the dataset)
#model.load_state_dict(torch.load(f"{save_dir}/Models/model_CC.pth"))
#checkpoint = torch.load(f"{save_dir}/Models/optimizer.pth")
#optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

num_epochs = 10
temperature = 2.0  # divisor applied to the loss inside run_epoch
num_training_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# training
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}: ")
    loss = run_epoch(model, train_loader, optimizer, scheduler, train=True, temperature=temperature)
    val_loss = run_epoch(model, val_loader, train=False, temperature=temperature)
    # Checkpoint after every epoch. The optimizer state is stored as well, so the
    # resume code above has a file to load.
    torch.save(model.state_dict(), f"{save_dir}/Models/model_CC.pth")
    torch.save({'optimizer_state_dict': optimizer.state_dict()}, f"{save_dir}/Models/optimizer.pth")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of VivitModel were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized: ['vivit.pooler.dense.bias', 'vivit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("/content/drive/My Drive/Master Thesis/CholecT50/final_model_CC_10.pth"))


Epoch 1/10: 


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 2/10: 
Epoch 3/10: 
Epoch 4/10: 
Epoch 5/10: 
Epoch 6/10: 
Epoch 7/10: 
Epoch 8/10: 
Epoch 9/10: 
Epoch 10/10: 


## Example

In [None]:
# Rebuild the architecture and restore the weights saved by the training cell.
# map_location lets a checkpoint written on a GPU load on a CPU-only runtime too.
model = SurgicalVideoCaptioner(device).to(device)
model.load_state_dict(torch.load(f"{save_dir}/Models/model_CC.pth", map_location=device))
model.eval()

In [None]:
num_examples = 10

# zip() stops as soon as either side is exhausted, so a test split with fewer than
# `num_examples` batches no longer raises StopIteration, and no batch is fetched
# beyond the ones that are printed.
for i, batch in zip(range(num_examples), test_loader):
    videos = batch['video'].to(device)
    frame_captions = batch['frame_captions']
    targets = batch['clip_caption']

    with torch.no_grad():
        pred_caption = model(videos, frame_captions)[0]
    target_caption = targets[0]

    print(f"Example {i+1}:")
    print("Target Caption: ", target_caption)
    print("Predicted Caption: ", pred_caption)
    print("-" * 50)


Some weights of VivitModel were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized: ['vivit.pooler.dense.bias', 'vivit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(f"/content/drive/My Drive/Master Thesis/CholecT50/final_model_CC_{20}.pth"))


Example 1:
Target Caption:  During the phase of carlot-triangle-dissection lasting 32 seconds, the grasper is retracting the gallbladder while the hook is dissecting the omentum.
Predicted Caption:  During the phase of carlot-triangle-dissection lasting 32 seconds, the grasper is retracting the gallbladder while the hook is dissecting the omentum.
--------------------------------------------------
Example 2:
Target Caption:  During the phase of gallbladder-dissection lasting 32 seconds, the hook is dissecting the gallbladder.
Predicted Caption:  During the phase of gallbladder-dissection lasting 32 seconds, the hook is dissecting the gallbleddity.
--------------------------------------------------
Example 3:
Target Caption:  During the phase of preparation lasting 32 seconds, the grasper is present, the grasper is retracting the gut, the grasper is retracting the liver and the grasper is retracting the gallbladder.
Predicted Caption:  During the phase of preparation lasting 32 seconds,

## Test

In [None]:
def evaluate_model(model, test_loader, device):
    """Caption every clip in `test_loader` and report BLEU, ROUGE and BERTScore averages.

    The model receives the frame captions stored in the batch. Clips whose
    reference or generated caption is empty are left out of all metrics.

    Args:
        model: captioner called as ``model(videos, frame_captions)``; the first
            element of its return value is used as the generated caption.
        test_loader: loader yielding batches built by ``collate_fn``; only the
            first clip of each batch is scored.
        device: device the clip tensor is moved to before the forward pass.

    Returns:
        dict mapping each metric name to its average over the scored clips
        (NaN when no clip could be scored).
    """
    def average(values):
        # NaN instead of a ZeroDivisionError when nothing was scored
        return sum(values) / len(values) if values else float("nan")

    model.eval()
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    smooth_fn = SmoothingFunction().method1
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    bleu_scores = []
    candidates, references = [], []  # caption pairs kept for one BERTScore call
    total_batches = len(test_loader)

    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            videos = batch['video'].to(device)
            frame_captions = batch['frame_captions']
            targets = batch['clip_caption'][0]

            generated_caption = model(videos, frame_captions)[0]

            if targets != "" and generated_caption != "":
                # Compute ROUGE scores; RougeScorer.score expects (target, prediction)
                scores = rouge.score(targets, generated_caption)
                for key in rouge_scores:
                    rouge_scores[key].append(scores[key].fmeasure)

                # Compute BLEU score
                reference = [targets.split()]
                hypothesis = generated_caption.split()
                bleu = sentence_bleu(reference, hypothesis, smoothing_function=smooth_fn)
                bleu_scores.append(bleu)

                candidates.append(generated_caption)
                references.append(targets)

            # progress bar (advanced for skipped clips as well)
            progress = (batch_idx + 1) / total_batches
            bar_length = 20
            filled_length = int(bar_length * progress)
            bar = "=" * filled_length + " " * (bar_length - filled_length)
            percentage = int(progress * 100)
            sys.stdout.write(f"\r[{bar}] {percentage}% - Batch {batch_idx+1}/{total_batches}")
            sys.stdout.flush()

    # Compute BERTScore once for all pairs instead of once per clip, so its
    # setup is not repeated on every iteration
    if candidates:
        logging.set_verbosity_error()
        P, R, F1 = bert_score(candidates, references, lang="en", rescale_with_baseline=True)
        bert_precision, bert_recall, bert_f1 = P.tolist(), R.tolist(), F1.tolist()
    else:
        bert_precision, bert_recall, bert_f1 = [], [], []

    # Compute average scores
    avg_rouge1 = average(rouge_scores['rouge1'])
    avg_rouge2 = average(rouge_scores['rouge2'])
    avg_rougeL = average(rouge_scores['rougeL'])
    avg_bleu = average(bleu_scores)
    avg_bert_precision = average(bert_precision)
    avg_bert_recall = average(bert_recall)
    avg_bert_f1 = average(bert_f1)

    print("\nOverall Scores:")
    print(f"Average BLEU: {avg_bleu:.4f}")
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougeL:.4f}")
    print(f"Average BERT Precision: {avg_bert_precision:.4f}")
    print(f"Average BERT Recall: {avg_bert_recall:.4f}")
    print(f"Average BERT F1: {avg_bert_f1:.4f}")

    return {
        "BLEU": avg_bleu,
        "ROUGE-1": avg_rouge1,
        "ROUGE-2": avg_rouge2,
        "ROUGE-L": avg_rougeL,
        "BERT Precision": avg_bert_precision,
        "BERT Recall": avg_bert_recall,
        "BERT F1": avg_bert_f1
    }



# Initial results
model = SurgicalVideoCaptioner(device).to(device)
# map_location lets a checkpoint written on a GPU load on a CPU-only runtime too
model.load_state_dict(torch.load(f"{save_dir}/Models/model_CC.pth", map_location=device))
print("Results using ground-truth frame captions: ")
scores = evaluate_model(model, test_loader, device)


  model.load_state_dict(torch.load("/content/drive/My Drive/Master Thesis/CholecT50/final_CC.pth"))


Results using ground-truth frame captions: 
Overall Scores:
Average BLEU: 0.6490
Average ROUGE-1: 0.8615
Average ROUGE-2: 0.7975
Average ROUGE-L: 0.7968
Average BERT Precision: 0.7733
Average BERT Recall: 0.7696
Average BERT F1: 0.7714


# Train with real data

In [13]:
# Frame-level captions read from the Predictions folder (presumably produced by a
# separate frame-captioning step). The code below looks entries up as
# prediction_frame_dataset[video][frame]["predicted_caption"].
with open(f"{save_dir}/Predictions/predicted_frames.json", "r") as f:
    prediction_frame_dataset = json.load(f)

In [None]:
def run_epoch(model, loader, optimizer=None, scheduler=None, train=True, temperature=1.0):
    """Run one training or validation pass using the predicted frame captions.

    Instead of the frame captions stored in the batch, the captions are looked
    up in `prediction_frame_dataset` by video folder and frame number.

    Args:
        model: captioner called as ``model(videos, predicted_captions, targets)``;
            its output must expose ``.loss``.
        loader: iterable of batches built by ``collate_fn``.
        optimizer: optimizer to step; required when ``train`` is True.
        scheduler: optional learning-rate scheduler, stepped after every
            optimizer step.
        train: True for a training pass, False for a validation pass.
        temperature: divisor applied to the loss before backpropagation and
            logging.

    Returns:
        Mean scaled loss over the batches that were actually processed, or NaN
        when every batch was skipped (or the loader is empty).
    """
    mode = "Training" if train else "Validation"
    if train:
        model.train()
    else:
        model.eval()

    # Prefer the model's own device; fall back to the notebook-level `device`.
    target_device = model.device if hasattr(model, "device") else device

    total_loss = 0.0
    processed = 0  # batches that contributed a loss (skipped ones do not count)
    total_batches = len(loader)
    default_caption = ""

    for batch_idx, batch in enumerate(loader):
        targets = batch['clip_caption']

        # Clips without a reference caption cannot be supervised; skip them
        # before doing any further work
        if targets[0] == "":
            continue

        videos = batch['video'].to(target_device)
        video_folder = batch['video_folder']
        frame_names = batch['frame_numbers']

        # One list of predicted captions per clip; frames without a prediction
        # get the empty default caption.
        # NOTE(review): a missing key falls back silently, e.g. if the JSON keys
        # and `frame_numbers` differ in type - confirm the key format.
        predicted_captions = [
            [
                prediction_frame_dataset.get(video, {}).get(frame, {}).get("predicted_caption", default_caption)
                for frame in frames
            ]
            for video, frames in zip(video_folder, frame_names)
        ]

        if train:
            optimizer.zero_grad()
            outputs = model(videos, predicted_captions, targets)
            loss = outputs.loss / temperature
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            if scheduler:
                scheduler.step()
        else:
            with torch.no_grad():
                outputs = model(videos, predicted_captions, targets)
                loss = outputs.loss / temperature

        total_loss += loss.item()
        processed += 1

        # Print progress bar
        progress = (batch_idx + 1) / total_batches
        bar_length = 20
        filled_length = int(bar_length * progress)
        bar = "=" * filled_length + " " * (bar_length - filled_length)
        percentage = int(progress * 100)
        sys.stdout.write(f"\r[{bar}] {percentage}% - Batch {batch_idx+1}/{total_batches} - {mode} Loss: {loss.item():.4f} - Avg {mode} Loss: {total_loss / processed:.4f}")
        sys.stdout.flush()

    print()
    # Dividing by len(loader) would understate the loss whenever batches were skipped
    return total_loss / processed if processed else float("nan")

################################################################################


# Initialize model and optimizer
model = SurgicalVideoCaptioner(device).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Start from the model trained with ground-truth frame captions.
# map_location lets a checkpoint written on a GPU load on a CPU-only runtime too.
model.load_state_dict(torch.load(f"{save_dir}/Models/model_CC.pth", map_location=device))
# Uncomment to also restore the optimizer state when resuming
#checkpoint = torch.load(f"{save_dir}/Models/optimizer.pth")
#optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

temperature = 2.0  # divisor applied to the loss inside run_epoch
num_epochs = 15
num_training_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


# Training loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}: ")
    loss = run_epoch(model, train_loader, optimizer, scheduler, train=True, temperature=temperature)
    val_loss = run_epoch(model, val_loader, train=False, temperature=temperature)

    # save (the file name has to match the one loaded by the comparison cell)
    torch.save(model.state_dict(), f"{save_dir}/Models/model_CC_robust.pth")
    torch.save({'optimizer_state_dict': optimizer.state_dict() }, f"{save_dir}/Models/optimizer.pth")

Some weights of VivitModel were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized: ['vivit.pooler.dense.bias', 'vivit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("/content/drive/My Drive/Master Thesis/CholecT50/final_CC.pth"))


Epoch 1/15: 


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 2/15: 
Epoch 3/15: 
Epoch 4/15: 
Epoch 5/15: 
Epoch 6/15: 
Epoch 7/15: 
Epoch 8/15: 
Epoch 9/15: 
Epoch 10/15: 
Epoch 11/15: 
Epoch 12/15: 
Epoch 13/15: 
Epoch 14/15: 
Epoch 15/15: 


# Simple Model

In [10]:
class Simple_Model(nn.Module):
    """Video-only baseline: ViViT tokens are projected and decoded by Flan-T5.

    Unlike the fused captioner, no frame captions are used; the projected video
    tokens are handed to the T5 model in place of its own encoder output.
    """

    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        """Load the pretrained backbones and tokenizer and move every module to `device`."""
        super().__init__()
        self.device = torch.device(device)

        # Video encoder (ViViT)
        self.vivit = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400").to(self.device)

        # Projection layers
        self.video_proj = nn.Linear(768, 768).to(self.device)

        # T5 decoder
        self.decoder = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base").to(self.device)
        self.decoder_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

    def forward(self, video, clip_caption=None, max_tokens=128):
        """Compute the training loss or generate a caption.

        Args:
            video: clip tensor whose first dimension is the batch.
            clip_caption: target caption(s). When given, the teacher-forced
                decoder output (including `.loss`) is returned; when None, a
                caption is generated.
            max_tokens: maximum length of the generated caption.

        Returns:
            The decoder output when `clip_caption` is given, otherwise the
            decoded caption string of the FIRST clip in the batch only.
        """
        batch_size = video.size(0)

        video = video.to(self.device)
        video_features = self.vivit(video).last_hidden_state
        video_features = self.video_proj(video_features)

        if clip_caption is not None:
            labels = self.decoder_tokenizer(
                clip_caption,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=128
            ).input_ids.to(self.device)
            # Padding positions must not contribute to the loss; -100 is the
            # index ignored by the T5 loss. This only matters when captions of
            # different length are batched together (batch_size > 1).
            labels = labels.masked_fill(labels == self.decoder_tokenizer.pad_token_id, -100)

            outputs = self.decoder(
                encoder_outputs=(video_features,),
                labels=labels
            )

            return outputs

        # Token ids have to be integers; the tensor is created directly on the
        # target device. With `encoder_outputs` supplied it presumably only
        # tells generate() the batch size.
        input_ids = torch.full(
            (batch_size, 1),
            self.decoder_tokenizer.pad_token_id,
            dtype=torch.long,
            device=self.device,
        )

        output = self.decoder.generate(
            input_ids=input_ids,
            encoder_outputs=BaseModelOutput(last_hidden_state=video_features),
            max_length=max_tokens,
            temperature=0.2,
            top_k=10,
            top_p=0.7,
            do_sample=True,
            no_repeat_ngram_size=2
        )
        generated_caption = self.decoder_tokenizer.decode(output[0], skip_special_tokens=True)

        return generated_caption


In [11]:
def run_epoch(model, loader, optimizer=None, scheduler=None, train=True, temperature=1.0):
    """Run one training or validation pass for the video-only model.

    Args:
        model: captioner called as ``model(videos, targets)``; its output must
            expose ``.loss``.
        loader: iterable of batches built by ``collate_fn``.
        optimizer: optimizer to step; required when ``train`` is True.
        scheduler: optional learning-rate scheduler, stepped after every
            optimizer step.
        train: True for a training pass, False for a validation pass.
        temperature: divisor applied to the loss before backpropagation and
            logging.

    Returns:
        Mean scaled loss over the batches that were actually processed, or NaN
        when every batch was skipped (or the loader is empty).
    """
    mode = "Training" if train else "Validation"
    if train:
        model.train()
    else:
        model.eval()

    # Prefer the model's own device; fall back to the notebook-level `device`.
    target_device = model.device if hasattr(model, "device") else device

    total_loss = 0.0
    processed = 0  # batches that contributed a loss (skipped ones do not count)
    total_batches = len(loader)

    for batch_idx, batch in enumerate(loader):
        targets = batch['clip_caption']

        # Clips without a reference caption cannot be supervised
        if targets[0] == "":
            continue

        videos = batch['video'].to(target_device)

        if train:
            optimizer.zero_grad()
            outputs = model(videos, targets)
            loss = outputs.loss / temperature
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            if scheduler:
                scheduler.step()
        else:
            with torch.no_grad():
                outputs = model(videos, targets)
                loss = outputs.loss / temperature

        total_loss += loss.item()
        processed += 1

        # Print progress bar
        progress = (batch_idx + 1) / total_batches
        bar_length = 20
        filled_length = int(bar_length * progress)
        bar = "=" * filled_length + " " * (bar_length - filled_length)
        percentage = int(progress * 100)
        sys.stdout.write(f"\r[{bar}] {percentage}% - Batch {batch_idx+1}/{total_batches} - {mode} Loss: {loss.item():.4f} - Avg {mode} Loss: {total_loss / processed:.4f}")
        sys.stdout.flush()

    print()
    # Dividing by len(loader) would understate the loss whenever batches were skipped
    return total_loss / processed if processed else float("nan")

# Build the video-only baseline together with its optimizer
model = Simple_Model(device).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
# Uncomment the next three lines to continue from an earlier run
#model.load_state_dict(torch.load(f"{save_dir}/Models/model_CC_simple.pth"))
#checkpoint = torch.load("/content/drive/My Drive/Master Thesis/CholecT50/optimizer.pth")
#optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

temperature = 2.0
num_epochs = 15
# Linear decay of the learning rate over all optimizer steps, no warm-up
num_training_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# training: one training and one validation pass per epoch, then checkpoint the weights
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}: ")
    loss = run_epoch(model, train_loader, optimizer, scheduler, train=True, temperature=temperature)
    val_loss = run_epoch(model, val_loader, train=False, temperature=temperature)
    torch.save(model.state_dict(), f"{save_dir}/Models/model_CC_simple.pth")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/18.6k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/356M [00:00<?, ?B/s]

Some weights of VivitModel were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized: ['vivit.pooler.dense.bias', 'vivit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/356M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



Epoch 1/15: 


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 2/15: 
Epoch 3/15: 
Epoch 4/15: 
Epoch 5/15: 
Epoch 6/15: 
Epoch 7/15: 
Epoch 8/15: 
Epoch 9/15: 
Epoch 10/15: 
Epoch 11/15: 
Epoch 12/15: 
Epoch 13/15: 
Epoch 14/15: 
Epoch 15/15: 


# Compare Simple Model, Model and Robust Model

In [None]:


def evaluate_model(model, loader, device, simple=False):
    """Caption every clip in `loader` and report BLEU, ROUGE and BERTScore averages.

    The fused model receives the frame captions looked up in
    `prediction_frame_dataset`; the simple model receives the clip only. Clips
    whose reference or generated caption is empty are left out of all metrics.

    Args:
        model: captioner to evaluate.
        loader: loader yielding batches built by ``collate_fn``; only the first
            clip of each batch is scored.
        device: device the clip tensor is moved to before the forward pass.
        simple: when True the model is called as ``model(videos)`` and its
            return value is used directly; otherwise it is called as
            ``model(videos, predicted_captions)`` and the first element of the
            result is used.

    Returns:
        dict mapping each metric name to its average over the scored clips
        (NaN when no clip could be scored).
    """
    def average(values):
        # NaN instead of a ZeroDivisionError when nothing was scored
        return sum(values) / len(values) if values else float("nan")

    model.eval()

    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    smooth_fn = SmoothingFunction().method1
    total_batches = len(loader)
    default_caption = ""

    # Metrics storage
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    bleu_scores = []
    candidates, references = [], []  # caption pairs kept for one BERTScore call

    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            videos = batch['video'].to(device)
            targets = batch['clip_caption'][0]

            if simple:
                predicted_caption = model(videos)
            else:
                video_folder = batch['video_folder']
                frame_names = batch['frame_numbers']
                # One list of predicted frame captions per clip; frames without
                # a prediction get the empty default caption
                predicted_captions = [
                    [
                        prediction_frame_dataset.get(video, {}).get(frame, {}).get("predicted_caption", default_caption)
                        for frame in frames
                    ]
                    for video, frames in zip(video_folder, frame_names)
                ]
                predicted_caption = model(videos, predicted_captions)[0]

            if predicted_caption != default_caption and targets != default_caption:

                # Compute ROUGE scores; RougeScorer.score expects (target, prediction)
                scores = rouge.score(targets, predicted_caption)
                for key in rouge_scores:
                    rouge_scores[key].append(scores[key].fmeasure)

                # Compute BLEU score
                reference = [targets.split()]
                hypothesis = predicted_caption.split()
                bleu = sentence_bleu(reference, hypothesis, smoothing_function=smooth_fn)
                bleu_scores.append(bleu)

                candidates.append(predicted_caption)
                references.append(targets)

            # Print progress bar
            progress = (batch_idx + 1) / total_batches
            bar_length = 20
            filled_length = int(bar_length * progress)
            bar = "=" * filled_length + " " * (bar_length - filled_length)
            percentage = int(progress * 100)
            sys.stdout.write(f"\r[{bar}] {percentage}% - Batch {batch_idx+1}/{total_batches}")
            sys.stdout.flush()

    # Compute BERTScore once for all pairs instead of once per clip, so its
    # setup is not repeated on every iteration
    if candidates:
        logging.set_verbosity_error()
        P, R, F1 = bert_score(candidates, references, lang="en", rescale_with_baseline=True)
        bert_precision, bert_recall, bert_f1 = P.tolist(), R.tolist(), F1.tolist()
    else:
        bert_precision, bert_recall, bert_f1 = [], [], []

    # Compute average scores
    avg_rouge1 = average(rouge_scores['rouge1'])
    avg_rouge2 = average(rouge_scores['rouge2'])
    avg_rougeL = average(rouge_scores['rougeL'])
    avg_bleu = average(bleu_scores)
    avg_bert_precision = average(bert_precision)
    avg_bert_recall = average(bert_recall)
    avg_bert_f1 = average(bert_f1)

    # Print overall results
    print("\nOverall Scores:")
    print(f"Average BLEU: {avg_bleu:.4f}")
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougeL:.4f}")
    print(f"Average BERT Precision: {avg_bert_precision:.4f}")
    print(f"Average BERT Recall: {avg_bert_recall:.4f}")
    print(f"Average BERT F1: {avg_bert_f1:.4f}")

    return {
        "BLEU": avg_bleu,
        "ROUGE-1": avg_rouge1,
        "ROUGE-2": avg_rouge2,
        "ROUGE-L": avg_rougeL,
        "BERT Precision": avg_bert_precision,
        "BERT Recall": avg_bert_recall,
        "BERT F1": avg_bert_f1
    }



def _load_and_evaluate(model, checkpoint_name, heading, *eval_args):
    """Load a checkpoint from ``{save_dir}/Models`` into ``model`` and evaluate it.

    Args:
        model: Model instance (already moved to ``device``) that receives the weights.
        checkpoint_name: File name of the state dict inside ``{save_dir}/Models``.
        heading: Text printed before the evaluation starts.
        *eval_args: Extra positional arguments forwarded unchanged to
            ``evaluate_model`` after ``(model, test_loader, device)``.

    Returns:
        Whatever ``evaluate_model`` returns for this model on ``test_loader``.
    """
    # map_location lets checkpoints saved on a GPU load on whatever `device`
    # the current runtime provides (including CPU-only sessions).
    # weights_only restricts unpickling to tensors/containers, which is all a
    # state dict needs.
    state_dict = torch.load(
        f"{save_dir}/Models/{checkpoint_name}",
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    print(heading)
    return evaluate_model(model, test_loader, device, *eval_args)


# Simple Model (the extra positional flag is passed through to evaluate_model)
model = Simple_Model().to(device)
scores_1 = _load_and_evaluate(model, "model_CC_simple.pth", "Results of simple model: ", True)


# Before Robustness
model = SurgicalVideoCaptioner(device).to(device)
scores_2 = _load_and_evaluate(model, "model_CC.pth", "Results before robustness: ")


# After Robustness
model = SurgicalVideoCaptioner(device).to(device)
scores_3 = _load_and_evaluate(model, "model_CC_robust.pth", "Results after robustness: ")


Some weights of VivitModel were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized: ['vivit.pooler.dense.bias', 'vivit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("/content/drive/My Drive/Master Thesis/CholecT50/final_simple_model_2.pth"))


Results of simple model: 


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Overall Scores:
Average BLEU: 0.5138
Average ROUGE-1: 0.7591
Average ROUGE-2: 0.6744
Average ROUGE-L: 0.7137
Average BERT Precision: 0.6666
Average BERT Recall: 0.6590
Average BERT F1: 0.6623


  model.load_state_dict(torch.load("/content/drive/My Drive/Master Thesis/CholecT50/final_CC.pth"))


Results before robustness: 
Overall Scores:
Average BLEU: 0.6023
Average ROUGE-1: 0.8317
Average ROUGE-2: 0.7447
Average ROUGE-L: 0.7605
Average BERT Precision: 0.6705
Average BERT Recall: 0.7486
Average BERT F1: 0.7090


  model.load_state_dict(torch.load("/content/drive/My Drive/Master Thesis/CholecT50/final_CC_robust.pth"))


Results after robustness: 
Overall Scores:
Average BLEU: 0.6715
Average ROUGE-1: 0.8672
Average ROUGE-2: 0.7991
Average ROUGE-L: 0.8318
Average BERT Precision: 0.7443
Average BERT Recall: 0.7772
Average BERT F1: 0.7607


# Store generated Clip Captions

In [None]:
# One clip per batch: predict_dataset stores only the first item of each batch.
batch_size = 1
# No shuffling for inference, so the clips stored per video follow the dataset
# order and repeated runs produce the same JSON.
loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
model = SurgicalVideoCaptioner(device).to(device)
# map_location makes a GPU-saved checkpoint loadable on the current `device`;
# weights_only limits unpickling to the tensors a state dict contains.
# Kept as the last expression so the cell still displays the key-matching result.
model.load_state_dict(
    torch.load(
        f"{save_dir}/Models/model_CC_robust.pth",
        map_location=device,
        weights_only=True,
    )
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of VivitModel were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized: ['vivit.pooler.dense.bias', 'vivit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("/content/drive/My Drive/Master Thesis/CholecT50/final_CC_robust_2.pth"))


<All keys matched successfully>

In [None]:
# Per-frame captions produced earlier; looked up by predict_dataset via
# video -> frame -> "predicted_caption".
with open(f"{save_dir}/Predictions/predicted_frames.json", "r") as frames_file:
    prediction_frame_dataset = json.loads(frames_file.read())

In [None]:
def predict_dataset(model, loader, device, frame_predictions=None):
    """Generate a clip caption for every batch and group the results by video.

    For each batch, the previously predicted frame captions are looked up
    (video -> frame -> "predicted_caption", empty string when missing) and
    passed to the model together with the video tensor. Only the first item
    of each batch is stored, so the loader is expected to use batch size 1.

    Args:
        model: Captioning model, called as ``model(videos, frame_captions)``;
            the first element of its output is stored as the clip caption.
        loader: Iterable of batches with the keys ``'video'``,
            ``'video_folder'`` and ``'frame_numbers'``; must support ``len``.
        device: Device the video tensor is moved to.
        frame_predictions: Optional nested dict of per-frame predictions.
            Defaults to the notebook-level ``prediction_frame_dataset``.

    Returns:
        Dict mapping each video folder to a list of dicts with the keys
        ``start_frame``, ``end_frame`` and ``predicted_caption``.
    """
    if frame_predictions is None:
        # Fall back to the global loaded from predicted_frames.json.
        frame_predictions = prediction_frame_dataset

    prediction_dataset = {}
    total_batches = len(loader)
    default_caption = ""

    # Inference must run in eval mode (dropout/normalisation layers behave
    # differently while training); the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch_idx, batch in enumerate(loader):
                videos = batch['video'].to(device)
                video_folder = batch['video_folder']
                frame_names = batch['frame_numbers']

                # One list of frame captions per clip in the batch.
                predicted_captions = [
                    [
                        frame_predictions.get(video, {}).get(frame, {}).get("predicted_caption", default_caption)
                        for frame in frames
                    ]
                    for video, frames in zip(video_folder, frame_names)
                ]

                predicted_caption = model(videos, predicted_captions)[0]

                # Store the clip under its video, tagged with its first/last frame.
                prediction_dataset.setdefault(video_folder[0], []).append({
                    "start_frame": frame_names[0][0],
                    "end_frame": frame_names[0][-1],
                    "predicted_caption": predicted_caption
                })

                # Print progress bar
                progress = (batch_idx + 1) / total_batches
                bar_length = 20
                filled_length = int(bar_length * progress)
                bar = "=" * filled_length + " " * (bar_length - filled_length)
                percentage = int(progress * 100)
                sys.stdout.write(f"\r[{bar}] {percentage}% - Batch {batch_idx+1}/{total_batches}")
                sys.stdout.flush()
    finally:
        model.train(was_training)

    return prediction_dataset

prediction_dataset = predict_dataset(model, loader, device)

# The dataset is exported in two runs: predicted_clips1 holds the first 30 videos,
# predicted_clips2 the last 20. Switch the active path for the second run.

save_path = f"{save_dir}/Predictions/predicted_clips1.json"
#save_path = f"{save_dir}/Predictions/predicted_clips2.json"

serialized_predictions = json.dumps(prediction_dataset, indent=4)
with open(save_path, "w") as out_file:
    out_file.write(serialized_predictions)



Merge `predicted_clips1.json` and `predicted_clips2.json` into a single JSON file containing the descriptions for the complete dataset.

In [None]:

# Load JSON files. Both partial files are written to the "Predictions" folder
# by the clip-prediction cell, so both must be read from there.
with open(f"{save_dir}/Predictions/predicted_clips1.json", "r") as f1, open(f"{save_dir}/Predictions/predicted_clips2.json", "r") as f2:
    data1 = json.load(f1)
    data2 = json.load(f2)

# Merge both JSONs
def merge_json(dict1, dict2):
    """Merge two per-video prediction dicts without modifying either input.

    Videos present in only one input are taken over as they are. For a video
    present in both, list values (the clip lists written by the prediction
    step) are concatenated with ``dict1``'s entries first, and dict values are
    combined with ``dict2``'s entries winning on key clashes.

    Args:
        dict1: First mapping of video id -> list or dict of predictions.
        dict2: Second mapping with the same value type per video.

    Returns:
        A new dict containing the merged predictions.

    Raises:
        TypeError: If a video appears in both inputs with values that are not
            both lists or both dicts.
    """
    merged = dict1.copy()  # Start with the first JSON

    for video_id, entries in dict2.items():
        if video_id not in merged:
            # Add new video
            merged[video_id] = entries
            continue

        existing = merged[video_id]
        # Build a new container instead of mutating `existing`, which is
        # still shared with dict1 through the shallow copy above.
        if isinstance(existing, list) and isinstance(entries, list):
            merged[video_id] = existing + entries
        elif isinstance(existing, dict) and isinstance(entries, dict):
            merged[video_id] = {**existing, **entries}
        else:
            raise TypeError(
                f"Cannot merge entries for video {video_id!r}: "
                f"{type(existing).__name__} vs {type(entries).__name__}"
            )

    return merged

merged_json = merge_json(data1, data2)

# Save merged JSON in the "Predictions" folder, the same directory the other
# prediction files in this notebook are read from and written to.
with open(f"{save_dir}/Predictions/predicted_clips.json", "w") as f:
    json.dump(merged_json, f, indent=4)
