In [1]:
import os
import csv
import random
import numpy as np
import torch
from torch.utils.data import DataLoader
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
from Bench.dataset.multi_dataset import CapDataset
import evaluate

# Load the evaluation metrics by name, in a fixed order, and bind each one to
# a module-level variable of the same name as the metric.
accuracy, bleu, bertscore, meteor, rouge = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "bleu", "bertscore", "meteor", "rouge")
)


Using the latest cached version of the module from /Users/rohith/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Mon Jan 27 18:01:49 2025) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/rohith/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Mon Jan 27 18:02:00 2025) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/rohith/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bertscore/cf4907b18f8f741f202232c0f8009a3bd49ff98802c245abcb6ea51a37a8c05b (last modified on Mon Jan 27 18:03:05 2025) since it couldn't be found locally at evaluate-

In [2]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and configure cuDNN for determinism.

    Args:
        seed: Seed value handed to every random number generator.
    """
    # Python and NumPy global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: default generator, current CUDA device, all CUDA devices.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: request deterministic kernels and switch off benchmark autotuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


In [3]:
def _str2bool(value):
    """Convert a command-line string such as "true" / "False" / "0" to a bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False") becomes True. This converter maps the
    usual spellings explicitly instead.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "on", "1"):
        return True
    # The empty string stays falsy, matching what bool("") returned before.
    if lowered in ("false", "f", "no", "n", "off", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_args(args=None):
    """Parses command-line arguments, preventing SystemExit in Jupyter.

    Args:
        args: Optional list of argument strings. When None, an empty list is
            parsed instead of ``sys.argv``, so the defaults are returned and
            the notebook kernel's own command line is never inspected.

    Returns:
        argparse.Namespace with the generation, device and data-path settings.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default="GoodBaiBai88/M3D-LaMed-Phi-3-4B")
    parser.add_argument('--max_length', type=int, default=512)
    parser.add_argument('--max_new_tokens', type=int, default=256)
    # type=bool would turn "--do_sample False" into True; parse the text instead.
    parser.add_argument('--do_sample', type=_str2bool, default=False)
    parser.add_argument('--top_p', type=float, default=None)
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--device', type=str, default="cpu", choices=["cuda", "cpu"])

    # Data paths
    parser.add_argument('--data_root', type=str, default="./")
    parser.add_argument('--cap_data_path', type=str, default="dataset.json")
    parser.add_argument('--output_dir', type=str, default="./LaMed/output/LaMed-finetune-0000/eval_caption/")

    parser.add_argument('--proj_out_num', type=int, default=256)

    return parser.parse_args(args if args is not None else [])


In [4]:
def postprocess_text(preds, labels):
    """Trim whitespace from predictions and labels for metric computation.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped predictions, and a list
        in which each stripped label is wrapped in its own one-element list.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        wrapped_labels.append([label.strip()])

    return cleaned_preds, wrapped_labels


In [5]:
# Fix every RNG seed before any model or data code runs.
seed_everything(seed=42)

# Parse an explicit empty argument list so the defaults are used and argparse
# never raises SystemExit inside Jupyter.
args = parse_args(args=[])

# torch.device built from the configured --device value.
device = torch.device(args.device)


In [6]:
# Tokenizer for the checkpoint: slow (non-fast) implementation, right-side
# padding, and a maximum length taken from --max_length.
# NOTE: trust_remote_code=True allows Python code shipped with the checkpoint
# repository to run locally; only use it with sources you trust.
tokenizer = AutoTokenizer.from_pretrained(
    args.model_name_or_path,
    trust_remote_code=True,
    use_fast=False,
    padding_side="right",
    model_max_length=args.max_length,
)

# Load the weights onto the CPU first, then move the model to the configured device.
model = AutoModelForCausalLM.from_pretrained(
    args.model_name_or_path,
    trust_remote_code=True,
    device_map='cpu',
)
model = model.to(device)


build_sam_vit_3d...




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
# Build the captioning dataset and a DataLoader over it.
# NOTE(review): the variable is called test_dataset but the split requested is
# 'train' -- confirm this is the intended split for evaluation.
test_dataset = CapDataset(args, tokenizer=tokenizer, mode='train')

test_dataloader = DataLoader(
    test_dataset,
    batch_size=1,  # downstream code reads question[0] / answer[0]
    num_workers=8,  # Reduced from 32 to avoid potential issues
    # Pinned host memory only helps host-to-GPU copies; enabling it on a
    # CPU-only run does nothing useful and makes PyTorch emit a warning.
    pin_memory=(device.type == "cuda"),
    shuffle=False,
    drop_last=False,
)


In [8]:
# Ensure the output directory exists and (re)create the results CSV with only
# its header row. Mode 'w' truncates any previous file; the handle is closed
# again as soon as the `with` block ends.
os.makedirs(args.output_dir, exist_ok=True)
output_path = os.path.join(args.output_dir, "eval_caption.csv")

with open(output_path, 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow([
        "Question",
        "Ground Truth",
        "Pred",
        "BLEU",
        "ROUGE-1",
        "METEOR",
        "BERT-F1",
    ])


In [9]:
# Run the full per-sample pipeline inside the loop: generate a caption, score
# it, and write one CSV row per sample.
#
# The results file is opened in append mode so the header row already written
# to output_path is preserved, and it stays open for the whole loop so every
# writer.writerow() call targets an open file handle.
with open(output_path, mode='a', newline='') as outfile:
    writer = csv.writer(outfile)

    for idx, sample in enumerate(tqdm(test_dataloader, desc="Processing")):
        if sample is None:
            print(f"[Warning] Skipping invalid sample at index {idx} (NoneType).")
            continue  # Skip bad samples

        question = sample["question"]
        answer = sample["answer"]

        # Tokenize the prompt and move both inputs to the target device.
        input_id = tokenizer(question, return_tensors="pt")['input_ids'].to(device)
        image = sample["image"].to(device)

        # Gradients are not needed for evaluation.
        with torch.no_grad():
            generation = model.generate(
                image, input_id,
                max_new_tokens=args.max_new_tokens,
                do_sample=args.do_sample,
                top_p=args.top_p,
                temperature=args.temperature
            )
        generated_texts = tokenizer.batch_decode(generation, skip_special_tokens=True)

        decoded_preds, decoded_labels = postprocess_text(generated_texts, answer)

        # Per-sample text-overlap and embedding-based scores.
        bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels, max_order=1)['bleu']
        rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=['rouge1'])['rouge1']
        meteor_score = meteor.compute(predictions=decoded_preds, references=decoded_labels)['meteor']
        bert_score = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
        bert_f1 = sum(bert_score['f1']) / len(bert_score['f1'])

        writer.writerow([question[0], answer[0], generated_texts[0], bleu_score, rouge_score, meteor_score, bert_f1])
        # Flush after each row so finished results survive an interrupted run.
        outfile.flush()

