In [None]:
# =====================================
# PART 0: SETUP & INSTALL
# =====================================

# =====================================
# PART 1: DATA LOADING
# =====================================
# Import necessary libraries
from google.colab import drive
import os
import json
import cv2
import numpy as np
from matplotlib import pyplot as plt


# === STEP 1: Mount Google Drive and Verify Path ===
from google.colab import drive
import os

drive.mount('/content/drive')

drive_path = '/content/drive/MyDrive/DC_hand_annotated/'

# Check if the drive path exists
if os.path.exists(drive_path):
    print(f"Drive path exists: {drive_path}")
else:
    print("Drive path not found")

# List files in the 'DC_hand_annotated' directory
files = os.listdir(drive_path)
print("Files in DC_hand_annotated:", files)






Mounted at /content/drive
Drive path exists: /content/drive/MyDrive/DC_hand_annotated/
Files in DC_hand_annotated: ['drivingcontexts_questions.json', 'drivingcontexts_annotations.json', 'kitti', 'nuscenes', 'pittsburgh', 'web']


In [None]:
# === STEP 2: Install Dependencies ===
!pip install transformers timm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->timm)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->timm)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->timm)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch->tim

In [None]:
# === STEP 3: Imports ===
import json
from PIL import Image
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from transformers import ViltProcessor, ViltForQuestionAnswering

In [None]:
# === STEP 4: Dataset Setup ===
class DrivingContextDataset(Dataset):
    def __init__(self, root_folder, image_root, max_samples=10):
        with open(os.path.join(root_folder, 'drivingcontexts_questions.json')) as f:
            data = json.load(f)  # Load the entire JSON data
            self.questions = data['questions']  # Access the 'questions' list

        with open(os.path.join(root_folder, 'drivingcontexts_annotations.json')) as f:
            data = json.load(f)  # Load the entire JSON data
            self.annotations = data['annotations']  # Access the 'annotations' list

        self.image_root = image_root
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor()
        ])

        # Limit to first `max_samples`
        self.questions = self.questions[:max_samples]
        self.annotations = self.annotations[:max_samples]

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        # Assuming the image filename is in 'image_id' instead of 'image'
        image_path = os.path.join(self.image_root, self.questions[idx]['image_id'].split('/')[-1])
        image = Image.open(image_path).convert("RGB")
        image = self.transform(image)

        question = self.questions[idx]['question']
        # Check if 'answer' key exists, otherwise provide a default value
        answer = self.annotations[idx].get('answer', '').lower().strip()

        return {
            'image': image,
            'question': question,
            'answer': answer
        }

In [None]:
# === STEP 5: Load Dataset ===
image_root = os.path.join(drive_path, 'pittsburgh')
dataset = DrivingContextDataset(drive_path, image_root, max_samples=10)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

In [None]:
# === STEP 6: Load ViLT Model ===
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa").to(device)
model.eval()

ViltForQuestionAnswering(
  (vilt): ViltModel(
    (embeddings): ViltEmbeddings(
      (text_embeddings): TextEmbeddings(
        (word_embeddings): Embedding(30522, 768)
        (position_embeddings): Embedding(40, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (patch_embeddings): ViltPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32))
      )
      (token_type_embeddings): Embedding(2, 768)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViltEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViltLayer(
          (attention): ViltAttention(
            (attention): ViltSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=76

In [None]:
# === STEP 7: Run Inference and Evaluate ===
from tqdm import tqdm

correct = 0
total = 0

model.eval()

for batch in tqdm(dataloader):
    # Remove the unsqueeze(0) as it's adding an extra dimension
    image = batch['image'].to(device)
    question = batch['question']
    true_answer = batch['answer']

    # Tokenize and encode
    encoding = processor(images=image, text=question, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        outputs = model(**encoding)
        logits = outputs.logits

    predicted_class_idx = logits.argmax(-1).item()
    predicted_answer = model.config.id2label[predicted_class_idx].lower()

    gt_answer = true_answer[0].lower().strip()

    print(f"Q: {question[0]}")
    print(f"Predicted: {predicted_answer} | Ground Truth: {gt_answer}")
    print("-" * 40)

    if predicted_answer == gt_answer:
        correct += 1
    total += 1

  0%|          | 0/10 [00:00<?, ?it/s]It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.
 40%|████      | 4/10 [00:00<00:00, 15.54it/s]

Q: Are we driving indoors?
Predicted: no | Ground Truth: 
----------------------------------------
Q: Are we driving outdoors?
Predicted: no | Ground Truth: 
----------------------------------------
Q: Is this during daytime?
Predicted: yes | Ground Truth: 
----------------------------------------
Q: Is this during nighttime?
Predicted: no | Ground Truth: 
----------------------------------------


 80%|████████  | 8/10 [00:00<00:00, 15.95it/s]

Q: Is this during twilight?
Predicted: no | Ground Truth: 
----------------------------------------
Q: Is this during sunny weather?
Predicted: yes | Ground Truth: 
----------------------------------------
Q: Is this during rainy weather?
Predicted: no | Ground Truth: 
----------------------------------------
Q: Is this during snowy weather?
Predicted: yes | Ground Truth: 
----------------------------------------


100%|██████████| 10/10 [00:00<00:00, 15.81it/s]

Q: Is this during foggy weather?
Predicted: no | Ground Truth: 
----------------------------------------
Q: Is this on a highway?
Predicted: no | Ground Truth: 
----------------------------------------





In [None]:
accuracy = correct / total if total > 0 else 0
print(f"\nFinal Accuracy on {total} samples: {accuracy:.4f}")



Final Accuracy on 10 samples: 0.0000


## next

In [None]:
# === STEP 1: Mount Google Drive and Verify Path ===
from google.colab import drive
import os

drive.mount('/content/drive')

drive_path = '/content/drive/MyDrive/DC_hand_annotated/'

Mounted at /content/drive


In [None]:

# === STEP 2: Load the JSON Files ===
import json

# Load questions and annotations
with open(os.path.join(drive_path, 'drivingcontexts_questions.json'), 'r') as f:
    questions_data = json.load(f)

with open(os.path.join(drive_path, 'drivingcontexts_annotations.json'), 'r') as f:
    annotations_data = json.load(f)

In [None]:
# === STEP 3: Prepare Data (Merge Questions and Annotations) ===
# Merging question data with their corresponding answers from annotations

merged_data = []

for q in questions_data['questions']:
    # Find the corresponding annotation for the question
    matching_annotation = next((a for a in annotations_data['annotations'] if a['question_id'] == q['question_id']), None)

    if matching_annotation:
        # Extract the correct answer (for simplicity, let's assume 'yes'/'no' answers)
        answer = matching_annotation['answers'][0]['answer']  # Adjust this if you need to extract a specific answer
        merged_data.append({
            'image_id': q['image_id'],
            'question': q['question'],
            'question_id': q['question_id'],
            'answer': answer
        })

In [None]:
# # === STEP 4: Display Images with Corresponding Questions and Answers ===
# import matplotlib.pyplot as plt
# import matplotlib.image as mpimg

# # Display the first 10 images with their corresponding question and answer
# for i in range(20):
#     image_id = merged_data[i]['image_id']
#     question = merged_data[i]['question']
#     answer = merged_data[i]['answer']

#     # Assuming image_id is the filename or an identifier for the image file
#     # The image_id already includes the path relative to the 'pittsburgh' directory
#     # and has the .jpg extension, so we just need to join it with the drive_path
#     image_path = os.path.join(drive_path, image_id)
#     img = mpimg.imread(image_path)

#     # Plot the image and show question and answer
#     plt.figure(figsize=(10, 6))
#     plt.imshow(img)
#     plt.axis('off')  # Hide axes
#     plt.title(f"Q: {question}\nA: {answer}")
#     plt.show()

In [None]:
# === STEP 4: Calculate Accuracy, Precision, Recall, and F1 Score ===

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def evaluate_model(predictions, ground_truth):
    """
    Evaluate the model's predictions against the ground truth using Accuracy, Precision, Recall, and F1 Score.

    Args:
    - predictions: List of predicted answers from the model.
    - ground_truth: List of true answers (ground truth).

    Returns:
    - A dictionary containing accuracy, precision, recall, and f1 score.
    """
    # Accuracy
    accuracy = accuracy_score(ground_truth, predictions)

    # Precision, Recall, and F1 Score
    precision = precision_score(ground_truth, predictions, pos_label='yes', average='binary')
    recall = recall_score(ground_truth, predictions, pos_label='yes', average='binary')
    f1 = f1_score(ground_truth, predictions, pos_label='yes', average='binary')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }
# Removed the 'Z' as it was likely a typo and not intended to be there.

In [None]:

ground_truth = [item['answer'] for item in merged_data[:10]]

In [None]:
# === STEP 5: Generate Predictions using Your Model ===
predictions = []
for i in range(10):  # Use the same range as ground_truth (first 10 samples)
    # Get image and question
    image = merged_data[i]['image_id']
    question = merged_data[i]['question']

    # TODO: Preprocess image and question to your ViltForQuestionAnswering model
    # Run model inference to get a prediction
    prediction = 'yes'  # Replace with model output logic

    predictions.append(prediction)

# === STEP 6: Evaluate Model ===
metrics = evaluate_model(predictions, ground_truth)

In [None]:
# === STEP 6: Evaluate Model ===
metrics = evaluate_model(predictions, ground_truth)

# === STEP 7: Output the Results ===
print(f"Accuracy: {metrics['accuracy']*100:.2f}%")
print(f"Precision: {metrics['precision']:.2f}")
print(f"Recall: {metrics['recall']:.2f}")
print(f"F1 Score: {metrics['f1_score']:.2f}")

Accuracy: 40.00%
Precision: 0.40
Recall: 1.00
F1 Score: 0.57


## continual learning


In [None]:
# === STEP A: Setup Environment ===
!pip install transformers datasets torchvision

import torch
from transformers import ViltProcessor, ViltForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset
from PIL import Image

# Load ViLT model and processor
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# === STEP B: Continual Learning Training Loop ===

def prepare_batch_data(batch):
    images, questions, answers = [], [], []
    for item in batch:
        image_path = os.path.join(drive_path, item['image_id'])
        image = Image.open(image_path).convert("RGB")
        images.append(image)
        questions.append(item['question'])
        answers.append(item['answer'])
    return processor(images=images, text=questions, return_tensors="pt", padding=True, truncation=True), answers

# Create incremental batches for continual learning
batch_size = 32
all_batches = [merged_data[i:i + batch_size] for i in range(0, len(merged_data), batch_size)]

# Fine-tune incrementally
for i, batch in enumerate(all_batches[:3]):  # You can increase range for more learning
    print(f"\n=== Training on batch {i+1} ===")
    inputs, labels = prepare_batch_data(batch)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    targets = [1 if a == "yes" else 0 for a in labels]  # Binary labels
    target_tensor = torch.tensor(targets).to(device)

    outputs = model(**inputs)
    logits = outputs.logits
    loss_fn = torch.nn.CrossEntropyLoss()
    loss = loss_fn(logits, target_tensor)
    loss.backward()
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    optimizer.step()
    optimizer.zero_grad()
    print(f"Loss: {loss.item():.4f}")

# === STEP C: Predict with Fine-tuned Model ===

# Evaluate on 10 samples
predictions = []
for item in merged_data[:10]:
    image_path = os.path.join(drive_path, item['image_id'])
    image = Image.open(image_path).convert("RGB")
    encoding = processor(image, item['question'], return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**encoding)
        predicted = torch.argmax(outputs.logits, dim=1).item()
        predictions.append("yes" if predicted == 1 else "no")


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]


=== Training on batch 1 ===
Loss: 21.7978

=== Training on batch 2 ===
Loss: 17.1745

=== Training on batch 3 ===
Loss: 13.0512


In [None]:
# Evaluate
ground_truth = [item['answer'] for item in merged_data[:10]]
metrics = evaluate_model(predictions, ground_truth)
print(f"Accuracy: {metrics['accuracy']*100:.2f}%")
print(f"Precision: {metrics['precision']:.2f}")
print(f"Recall: {metrics['recall']:.2f}")
print(f"F1 Score: {metrics['f1_score']:.2f}")


Accuracy: 60.00%
Precision: 0.00
Recall: 0.00
F1 Score: 0.00


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## bayesian uncertainity estimation

In [None]:
import numpy as np

def enable_dropout(model):
    """Enable dropout layers during inference."""
    for m in model.modules():
        if m.__class__.__name__.startswith('Dropout'):
            m.train()

def get_uncertainty_predictions(image, question, runs=10):
    encoding = processor(image, question, return_tensors="pt").to(device)
    all_logits = []

    model.eval()
    enable_dropout(model)  # Enable dropout at inference

    for _ in range(runs):
        with torch.no_grad():
            outputs = model(**encoding)
            logits = torch.nn.functional.softmax(outputs.logits, dim=1)
            all_logits.append(logits.cpu().numpy())

    stacked = np.stack(all_logits)
    mean_logits = stacked.mean(axis=0)
    std_logits = stacked.std(axis=0)

    pred_class = np.argmax(mean_logits)
    uncertainty = std_logits[0][pred_class]
    return "yes" if pred_class == 1 else "no", uncertainty


In [None]:
predictions, uncertainties = [], []
for item in merged_data[:10]:
    image_path = os.path.join(drive_path, item['image_id'])
    image = Image.open(image_path).convert("RGB")
    pred, uncertainty = get_uncertainty_predictions(image, item['question'])
    predictions.append(pred)
    uncertainties.append(uncertainty)

# Print uncertainty results
for i, item in enumerate(merged_data[:10]):
    print(f"Q: {item['question']}")
    print(f"Pred: {predictions[i]}, GT: {item['answer']}, Uncertainty: {uncertainties[i]:.4f}\n")

# Evaluate performance
ground_truth = [item['answer'] for item in merged_data[:10]]
metrics = evaluate_model(predictions, ground_truth)
print(f"\nBayesian Accuracy: {metrics['accuracy']*100:.2f}%")
print(f"Precision: {metrics['precision']:.2f}")
print(f"Recall: {metrics['recall']:.2f}")
print(f"F1 Score: {metrics['f1_score']:.2f}")


Q: Are we driving indoors?
Pred: no, GT: no, Uncertainty: 0.0000

Q: Are we driving outdoors?
Pred: no, GT: yes, Uncertainty: 0.0000

Q: Is this during daytime?
Pred: no, GT: yes, Uncertainty: 0.0000

Q: Is this during nighttime?
Pred: no, GT: no, Uncertainty: 0.0000

Q: Is this during twilight?
Pred: no, GT: no, Uncertainty: 0.0000

Q: Is this during sunny weather?
Pred: no, GT: yes, Uncertainty: 0.0000

Q: Is this during rainy weather?
Pred: no, GT: no, Uncertainty: 0.0000

Q: Is this during snowy weather?
Pred: no, GT: no, Uncertainty: 0.0000

Q: Is this during foggy weather?
Pred: no, GT: no, Uncertainty: 0.0000

Q: Is this on a highway?
Pred: no, GT: yes, Uncertainty: 0.0000


Bayesian Accuracy: 60.00%
Precision: 0.00
Recall: 0.00
F1 Score: 0.00


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Novelity

In [None]:
# === STEP 1: Sort data for Continual Learning Simulation ===
# Shuffle for randomness
import random
random.shuffle(merged_data)

# Divide into 4 incremental batches (e.g., 25% each)
split_1 = merged_data[:len(merged_data)//4]
split_2 = merged_data[len(merged_data)//4:len(merged_data)//2]
split_3 = merged_data[len(merged_data)//2:3*len(merged_data)//4]
split_4 = merged_data[3*len(merged_data)//4:]

continual_splits = [split_1, split_2, split_3, split_4]


In [None]:
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch.nn.functional as F
import torch

# Load ViLT processor and model
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Put model in train mode to enable dropout during inference (for MC Dropout)
model.train()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


ViltForQuestionAnswering(
  (vilt): ViltModel(
    (embeddings): ViltEmbeddings(
      (text_embeddings): TextEmbeddings(
        (word_embeddings): Embedding(30522, 768)
        (position_embeddings): Embedding(40, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (patch_embeddings): ViltPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32))
      )
      (token_type_embeddings): Embedding(2, 768)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViltEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViltLayer(
          (attention): ViltAttention(
            (attention): ViltSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=76

In [None]:
from PIL import Image
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

results_list = []

def predict_with_uncertainty(image_path, question, n_samples=10):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(image, question, return_tensors="pt").to(device)

    outputs = []
    for _ in range(n_samples):
        with torch.no_grad():
            output = model(**inputs).logits
            outputs.append(F.softmax(output, dim=1).cpu().numpy())
    probs = np.array(outputs)
    mean_probs = probs.mean(axis=0)[0]
    std_dev = probs.std(axis=0)[0]
    return mean_probs, std_dev

def evaluate_batch(batch, batch_id):
    y_true, y_pred, uncertainties = [], [], []
    records = []

    for item in batch:
        image_path = os.path.join(drive_path, item['image_id'])
        mean_probs, std = predict_with_uncertainty(image_path, item['question'])
        pred_label = model.config.id2label[np.argmax(mean_probs)]
        true_label = item['answer']

        y_pred.append(pred_label)
        y_true.append(true_label)
        uncertainties.append(np.mean(std))  # average uncertainty across 2 classes

        records.append({
            'image_id': item['image_id'],
            'question': item['question'],
            'ground_truth': true_label,
            'prediction': pred_label,
            'uncertainty': np.mean(std)
        })

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label='yes')
    recall = recall_score(y_true, y_pred, pos_label='yes')
    f1 = f1_score(y_true, y_pred, pos_label='yes')

    print(f"\n=== Batch {batch_id} Evaluation ===")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

    results_list.append({
        'batch': batch_id,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'avg_uncertainty': np.mean(uncertainties)
    })

    return pd.DataFrame(records)


In [None]:
all_records_df = pd.DataFrame()

for i, batch in enumerate(continual_splits):
    batch_df = evaluate_batch(batch, batch_id=i+1)
    all_records_df = pd.concat([all_records_df, batch_df], ignore_index=True)

# Save predictions + uncertainty
all_records_df.to_csv(os.path.join(drive_path, 'predictions_with_uncertainty.csv'), index=False)

# Save metrics per batch
metrics_df = pd.DataFrame(results_list)
metrics_df.to_csv(os.path.join(drive_path, 'batchwise_metrics.csv'), index=False)



=== Batch 1 Evaluation ===
Accuracy: 0.90
Precision: 0.78
Recall: 0.86
F1 Score: 0.82
