In [None]:
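# One-time setup (uncomment to run):
# !git clone https://github.com/deepseek-ai/Janus
# Use %cd rather than !cd: a `!cd` runs in a throwaway subshell and does not change the notebook's working directory.
# %cd Janus

# Quote the version specifiers: an unquoted `>=` is parsed by the shell as an output redirection.
# !pip install torch==2.0.1 "transformers>=4.38.2" "timm>=0.9.16" accelerate sentencepiece attrdict einops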


In [None]:
# !cd Janus/janus

In [None]:
# pip install janus

In [None]:
# pip show janus

In [None]:
# import os
# print(os.getcwd())


In [None]:
import janus

In [None]:
# import sys
# sys.path.append("Janus/janus")

In [None]:
# pip show janus


In [None]:
# pip install Janus/janus


In [None]:

import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images

# Single-image smoke test: ask the model to transcribe a formula image into LaTeX.

# Checkpoint to load; the chat processor and its tokenizer come from the same path.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode.
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# One user turn carrying the image, followed by an empty assistant turn for the
# model to complete.
conversation = [
    {
        "role": "User",
        "content": "<image_placeholder>\nConvert the formula into latex code.",
        "images": ["images/equation.png"],
    },
    {"role": "Assistant", "content": ""},
]

# load images and prepare for inputs
pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
    conversations=conversation, images=pil_images, force_batchify=True
).to(vl_gpt.device)

# eval() does not switch autograd off, so run the whole forward path under
# no_grad to avoid building a gradient graph that inference never uses.
with torch.no_grad():
    # run image encoder to get the image embeddings
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # run the model to get the response (greedy decoding, at most 512 new tokens)
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

# Decode the first (only) sequence and print it after the formatted prompt.
answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(f"{prepare_inputs['sft_format'][0]}", answer)


# ----------------------------- Count -------------------

## General Inferencing

In [None]:
import torch

# Report whether a CUDA device is visible to PyTorch, and which one is in use.
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    print(f"CUDA is available. Device count: {torch.cuda.device_count()}")
    print(f"Using device: {torch.cuda.get_device_name(0)}")
    print(f"Current device index: {torch.cuda.current_device()}")

# Brightness-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the "count" questions against the Brightness-corrupted images and
# tally correct/total answers per noise type in `noise_type_accuracy`.

# Checkpoint to evaluate; the chat processor and its tokenizer come from the same path.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode.
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies: {noise_type: {"correct": int, "total": int}}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Probe the image first so a missing file skips the row instead of aborting
    # the run. The `with` closes the probe handle right away; the image passed to
    # the model is loaded separately by load_pil_images from the conversation.
    image_path = f"Noisy DARE TEST/Brightness/{row['path']}"
    try:
        with Image.open(image_path) as image:
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the
    # multiple-choice prompt, then an empty assistant turn to be completed.
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # eval() does not switch autograd off, so run the whole forward path under
    # no_grad to avoid building a gradient graph that evaluation never uses.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter. An empty
    # generation has no words; score it as a miss instead of raising IndexError
    # and losing the tallies collected so far.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 assumes the letter is the third character of the
    # stored answer string (e.g. "['A']") - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for each noise type recorded in `noise_type_accuracy`
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no scored questions is reported as 0%
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Fold the per-noise-type tallies into the overall counters
total_correct = sum(tally["correct"] for tally in noise_type_accuracy.values())
total_predictions = sum(tally["total"] for tally in noise_type_accuracy.values())

# Overall accuracy across every noise type (0 when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Contrast-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the "count" questions against the Contrast-corrupted images and
# tally correct/total answers per noise type in `noise_type_accuracy`.

# Checkpoint to evaluate; the chat processor and its tokenizer come from the same path.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode.
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies: {noise_type: {"correct": int, "total": int}}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Probe the image first so a missing file skips the row instead of aborting
    # the run. The `with` closes the probe handle right away; the image passed to
    # the model is loaded separately by load_pil_images from the conversation.
    image_path = f"Noisy DARE TEST/Contrast/{row['path']}"
    try:
        with Image.open(image_path) as image:
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the
    # multiple-choice prompt, then an empty assistant turn to be completed.
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # eval() does not switch autograd off, so run the whole forward path under
    # no_grad to avoid building a gradient graph that evaluation never uses.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter. An empty
    # generation has no words; score it as a miss instead of raising IndexError
    # and losing the tallies collected so far.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 assumes the letter is the third character of the
    # stored answer string (e.g. "['A']") - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for each noise type recorded in `noise_type_accuracy`
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no scored questions is reported as 0%
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Fold the per-noise-type tallies into the overall counters
total_correct = sum(tally["correct"] for tally in noise_type_accuracy.values())
total_predictions = sum(tally["total"] for tally in noise_type_accuracy.values())

# Overall accuracy across every noise type (0 when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Defocus-blur-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the "count" questions against the Defocus-blur-corrupted images and
# tally correct/total answers per noise type in `noise_type_accuracy`.

# Checkpoint to evaluate; the chat processor and its tokenizer come from the same path.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode.
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies: {noise_type: {"correct": int, "total": int}}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Probe the image first so a missing file skips the row instead of aborting
    # the run. The `with` closes the probe handle right away; the image passed to
    # the model is loaded separately by load_pil_images from the conversation.
    image_path = f"Noisy DARE TEST/Defocus-blur/{row['path']}"
    try:
        with Image.open(image_path) as image:
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the
    # multiple-choice prompt, then an empty assistant turn to be completed.
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # eval() does not switch autograd off, so run the whole forward path under
    # no_grad to avoid building a gradient graph that evaluation never uses.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter. An empty
    # generation has no words; score it as a miss instead of raising IndexError
    # and losing the tallies collected so far.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 assumes the letter is the third character of the
    # stored answer string (e.g. "['A']") - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for each noise type recorded in `noise_type_accuracy`
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no scored questions is reported as 0%
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Fold the per-noise-type tallies into the overall counters
total_correct = sum(tally["correct"] for tally in noise_type_accuracy.values())
total_predictions = sum(tally["total"] for tally in noise_type_accuracy.values())

# Overall accuracy across every noise type (0 when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Elastic-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the "count" questions against the Elastic-corrupted images and
# tally correct/total answers per noise type in `noise_type_accuracy`.

# Checkpoint to evaluate; the chat processor and its tokenizer come from the same path.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode.
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies: {noise_type: {"correct": int, "total": int}}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Probe the image first so a missing file skips the row instead of aborting
    # the run. The `with` closes the probe handle right away; the image passed to
    # the model is loaded separately by load_pil_images from the conversation.
    image_path = f"Noisy DARE TEST/Elastic/{row['path']}"
    try:
        with Image.open(image_path) as image:
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the
    # multiple-choice prompt, then an empty assistant turn to be completed.
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # eval() does not switch autograd off, so run the whole forward path under
    # no_grad to avoid building a gradient graph that evaluation never uses.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter. An empty
    # generation has no words; score it as a miss instead of raising IndexError
    # and losing the tallies collected so far.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 assumes the letter is the third character of the
    # stored answer string (e.g. "['A']") - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for each noise type recorded in `noise_type_accuracy`
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no scored questions is reported as 0%
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Fold the per-noise-type tallies into the overall counters
total_correct = sum(tally["correct"] for tally in noise_type_accuracy.values())
total_predictions = sum(tally["total"] for tally in noise_type_accuracy.values())

# Overall accuracy across every noise type (0 when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Fog-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the "count" questions against the Fog-corrupted images and
# tally correct/total answers per noise type in `noise_type_accuracy`.

# Checkpoint to evaluate; the chat processor and its tokenizer come from the same path.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode.
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies: {noise_type: {"correct": int, "total": int}}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Probe the image first so a missing file skips the row instead of aborting
    # the run. The `with` closes the probe handle right away; the image passed to
    # the model is loaded separately by load_pil_images from the conversation.
    image_path = f"Noisy DARE TEST/Fog/{row['path']}"
    try:
        with Image.open(image_path) as image:
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the
    # multiple-choice prompt, then an empty assistant turn to be completed.
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # eval() does not switch autograd off, so run the whole forward path under
    # no_grad to avoid building a gradient graph that evaluation never uses.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter. An empty
    # generation has no words; score it as a miss instead of raising IndexError
    # and losing the tallies collected so far.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 assumes the letter is the third character of the
    # stored answer string (e.g. "['A']") - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for each noise type recorded in `noise_type_accuracy`
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no scored questions is reported as 0%
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Fold the per-noise-type tallies into the overall counters
total_correct = sum(tally["correct"] for tally in noise_type_accuracy.values())
total_predictions = sum(tally["total"] for tally in noise_type_accuracy.values())

# Overall accuracy across every noise type (0 when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Frost-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the "count" questions against the Frost-corrupted images and
# tally correct/total answers per noise type in `noise_type_accuracy`.

# Checkpoint to evaluate; the chat processor and its tokenizer come from the same path.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode.
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies: {noise_type: {"correct": int, "total": int}}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Probe the image first so a missing file skips the row instead of aborting
    # the run. The `with` closes the probe handle right away; the image passed to
    # the model is loaded separately by load_pil_images from the conversation.
    image_path = f"Noisy DARE TEST/Frost/{row['path']}"
    try:
        with Image.open(image_path) as image:
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the
    # multiple-choice prompt, then an empty assistant turn to be completed.
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # eval() does not switch autograd off, so run the whole forward path under
    # no_grad to avoid building a gradient graph that evaluation never uses.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter. An empty
    # generation has no words; score it as a miss instead of raising IndexError
    # and losing the tallies collected so far.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 assumes the letter is the third character of the
    # stored answer string (e.g. "['A']") - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for each noise type recorded in `noise_type_accuracy`
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no scored questions is reported as 0%
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Fold the per-noise-type tallies into the overall counters
total_correct = sum(tally["correct"] for tally in noise_type_accuracy.values())
total_predictions = sum(tally["total"] for tally in noise_type_accuracy.values())

# Overall accuracy across every noise type (0 when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Gaussian-noise-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the "count" questions against the Gaussian-noise-corrupted images and
# tally correct/total answers per noise type in `noise_type_accuracy`.

# Checkpoint to evaluate; the chat processor and its tokenizer come from the same path.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode.
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies: {noise_type: {"correct": int, "total": int}}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Probe the image first so a missing file skips the row instead of aborting
    # the run. The `with` closes the probe handle right away; the image passed to
    # the model is loaded separately by load_pil_images from the conversation.
    image_path = f"Noisy DARE TEST/Gaussian-noise/{row['path']}"
    try:
        with Image.open(image_path) as image:
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the
    # multiple-choice prompt, then an empty assistant turn to be completed.
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # eval() does not switch autograd off, so run the whole forward path under
    # no_grad to avoid building a gradient graph that evaluation never uses.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter. An empty
    # generation has no words; score it as a miss instead of raising IndexError
    # and losing the tallies collected so far.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 assumes the letter is the third character of the
    # stored answer string (e.g. "['A']") - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for each noise type recorded in `noise_type_accuracy`
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no scored questions is reported as 0%
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Fold the per-noise-type tallies into the overall counters
total_correct = sum(tally["correct"] for tally in noise_type_accuracy.values())
total_predictions = sum(tally["total"] for tally in noise_type_accuracy.values())

# Overall accuracy across every noise type (0 when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Impulse-Noise-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "count" questions using the Impulse-noise images
# and tally correct/total predictions per question noise type.

# specify the path to the model
# NOTE(review): the processor and model are loaded from scratch by this cell,
# even when an earlier cell already loaded the same checkpoint.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Check that the corresponding image exists. The opened image object is not
    # used afterwards (load_pil_images() receives the path via `conversation`),
    # so the file handle is closed immediately.
    image_path = f"Noisy DARE TEST/Impulse-noise/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # run image encoder to get the image embeddings; gradients are never used
    # here, so skip building an autograd graph
    with torch.no_grad():
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # run the model to get the response
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty generation
    # yields "" instead of raising IndexError and is scored as incorrect.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a value such as
    # "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`,
# then the aggregate accuracy over all of them.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A bucket without predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals
total_correct = sum(bucket["correct"] for bucket in noise_type_accuracy.values())
total_predictions = sum(bucket["total"] for bucket in noise_type_accuracy.values())

print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# JPEG-compression-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "count" questions using the JPEG-compression
# images and tally correct/total predictions per question noise type.

# specify the path to the model
# NOTE(review): the processor and model are loaded from scratch by this cell,
# even when an earlier cell already loaded the same checkpoint.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Check that the corresponding image exists. The opened image object is not
    # used afterwards (load_pil_images() receives the path via `conversation`),
    # so the file handle is closed immediately.
    image_path = f"Noisy DARE TEST/JPEG-compression/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # run image encoder to get the image embeddings; gradients are never used
    # here, so skip building an autograd graph
    with torch.no_grad():
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # run the model to get the response
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty generation
    # yields "" instead of raising IndexError and is scored as incorrect.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a value such as
    # "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`,
# then the aggregate accuracy over all of them.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A bucket without predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals
total_correct = sum(bucket["correct"] for bucket in noise_type_accuracy.values())
total_predictions = sum(bucket["total"] for bucket in noise_type_accuracy.values())

print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Pixelate-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "count" questions using the Pixelate images
# and tally correct/total predictions per question noise type.

# specify the path to the model
# NOTE(review): the processor and model are loaded from scratch by this cell,
# even when an earlier cell already loaded the same checkpoint.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Check that the corresponding image exists. The opened image object is not
    # used afterwards (load_pil_images() receives the path via `conversation`),
    # so the file handle is closed immediately.
    image_path = f"Noisy DARE TEST/Pixelate/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # run image encoder to get the image embeddings; gradients are never used
    # here, so skip building an autograd graph
    with torch.no_grad():
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # run the model to get the response
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty generation
    # yields "" instead of raising IndexError and is scored as incorrect.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a value such as
    # "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`,
# then the aggregate accuracy over all of them.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A bucket without predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals
total_correct = sum(bucket["correct"] for bucket in noise_type_accuracy.values())
total_predictions = sum(bucket["total"] for bucket in noise_type_accuracy.values())

print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Rain-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "count" questions using the Rain images
# and tally correct/total predictions per question noise type.

# specify the path to the model
# NOTE(review): the processor and model are loaded from scratch by this cell,
# even when an earlier cell already loaded the same checkpoint.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Check that the corresponding image exists. The opened image object is not
    # used afterwards (load_pil_images() receives the path via `conversation`),
    # so the file handle is closed immediately.
    image_path = f"Noisy DARE TEST/Rain/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # run image encoder to get the image embeddings; gradients are never used
    # here, so skip building an autograd graph
    with torch.no_grad():
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # run the model to get the response
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty generation
    # yields "" instead of raising IndexError and is scored as incorrect.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a value such as
    # "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`,
# then the aggregate accuracy over all of them.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A bucket without predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals
total_correct = sum(bucket["correct"] for bucket in noise_type_accuracy.values())
total_predictions = sum(bucket["total"] for bucket in noise_type_accuracy.values())

print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Saturation-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "count" questions using the Saturation images
# and tally correct/total predictions per question noise type.

# specify the path to the model
# NOTE(review): the processor and model are loaded from scratch by this cell,
# even when an earlier cell already loaded the same checkpoint.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Check that the corresponding image exists. The opened image object is not
    # used afterwards (load_pil_images() receives the path via `conversation`),
    # so the file handle is closed immediately.
    image_path = f"Noisy DARE TEST/Saturation/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # run image encoder to get the image embeddings; gradients are never used
    # here, so skip building an autograd graph
    with torch.no_grad():
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # run the model to get the response
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty generation
    # yields "" instead of raising IndexError and is scored as incorrect.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a value such as
    # "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`,
# then the aggregate accuracy over all of them.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A bucket without predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals
total_correct = sum(bucket["correct"] for bucket in noise_type_accuracy.values())
total_predictions = sum(bucket["total"] for bucket in noise_type_accuracy.values())

print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Shot-noise-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "count" questions using the Shot-noise images
# and tally correct/total predictions per question noise type.

# specify the path to the model
# NOTE(review): the processor and model are loaded from scratch by this cell,
# even when an earlier cell already loaded the same checkpoint.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Check that the corresponding image exists. The opened image object is not
    # used afterwards (load_pil_images() receives the path via `conversation`),
    # so the file handle is closed immediately.
    image_path = f"Noisy DARE TEST/Shot-noise/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # run image encoder to get the image embeddings; gradients are never used
    # here, so skip building an autograd graph
    with torch.no_grad():
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # run the model to get the response
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty generation
    # yields "" instead of raising IndexError and is scored as incorrect.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a value such as
    # "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`,
# then the aggregate accuracy over all of them.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A bucket without predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals
total_correct = sum(bucket["correct"] for bucket in noise_type_accuracy.values())
total_predictions = sum(bucket["total"] for bucket in noise_type_accuracy.values())

print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Snow-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "count" questions using the Snow images
# and tally correct/total predictions per question noise type.

# specify the path to the model
# NOTE(review): the processor and model are loaded from scratch by this cell,
# even when an earlier cell already loaded the same checkpoint.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Check that the corresponding image exists. The opened image object is not
    # used afterwards (load_pil_images() receives the path via `conversation`),
    # so the file handle is closed immediately.
    image_path = f"Noisy DARE TEST/Snow/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # run image encoder to get the image embeddings; gradients are never used
    # here, so skip building an autograd graph
    with torch.no_grad():
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # run the model to get the response
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty generation
    # yields "" instead of raising IndexError and is scored as incorrect.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a value such as
    # "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`,
# then the aggregate accuracy over all of them.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A bucket without predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals
total_correct = sum(bucket["correct"] for bucket in noise_type_accuracy.values())
total_predictions = sum(bucket["total"] for bucket in noise_type_accuracy.values())

print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Spatter-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "count" questions using the Spatter images
# and tally correct/total predictions per question noise type.

# specify the path to the model
# NOTE(review): the processor and model are loaded from scratch by this cell,
# even when an earlier cell already loaded the same checkpoint.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Check that the corresponding image exists. The opened image object is not
    # used afterwards (load_pil_images() receives the path via `conversation`),
    # so the file handle is closed immediately.
    image_path = f"Noisy DARE TEST/Spatter/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # run image encoder to get the image embeddings; gradients are never used
    # here, so skip building an autograd graph
    with torch.no_grad():
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # run the model to get the response
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty generation
    # yields "" instead of raising IndexError and is scored as incorrect.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a value such as
    # "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`,
# then the aggregate accuracy over all of them.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A bucket without predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals
total_correct = sum(bucket["correct"] for bucket in noise_type_accuracy.values())
total_predictions = sum(bucket["total"] for bucket in noise_type_accuracy.values())

print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Speckle-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Checkpoint id from which both the chat processor and the model weights are loaded
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies; a missing key starts at {"correct": 0, "total": 0}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Skip rows whose Speckle-noise image is missing. Image.open serves only as an
    # existence check (the path is handed to load_pil_images below), so the context
    # manager closes the handle instead of leaking one open file per row.
    image_path = f"Noisy DARE TEST/Speckle-noise/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the multiple-choice
    # prompt, followed by an empty assistant turn for the model to complete
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image encoder does not record a graph
    # that is never back-propagated through
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, EOS doubles as padding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter (e.g., "A", "B",
    # "C", or "D"). An empty generation has no words; score it as a miss instead of
    # crashing the whole run with an IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a string like "['A']" - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = 0
total_predictions = 0

# Report each noise type's tally and accuracy, then fold it into the totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no scored rows is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over all scored rows (0% when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Zoom-blur-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Checkpoint id from which both the chat processor and the model weights are loaded
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies; a missing key starts at {"correct": 0, "total": 0}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Skip rows whose Zoom-Blur image is missing. Image.open serves only as an
    # existence check (the path is handed to load_pil_images below), so the context
    # manager closes the handle instead of leaking one open file per row.
    image_path = f"Noisy DARE TEST/Zoom-Blur/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the multiple-choice
    # prompt, followed by an empty assistant turn for the model to complete
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image encoder does not record a graph
    # that is never back-propagated through
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, EOS doubles as padding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter (e.g., "A", "B",
    # "C", or "D"). An empty generation has no words; score it as a miss instead of
    # crashing the whole run with an IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a string like "['A']" - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = 0
total_predictions = 0

# Report each noise type's tally and accuracy, then fold it into the totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no scored rows is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over all scored rows (0% when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Motion-blur-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Checkpoint id from which both the chat processor and the model weights are loaded
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies; a missing key starts at {"correct": 0, "total": 0}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "count"
for index, row in data.iterrows():
    if row["category"] != "count":
        continue  # Skip non-count categories

    # Skip rows whose Motion-blur image is missing. Image.open serves only as an
    # existence check (the path is handed to load_pil_images below), so the context
    # manager closes the handle instead of leaking one open file per row.
    image_path = f"Noisy DARE TEST/Motion-blur/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the multiple-choice
    # prompt, followed by an empty assistant turn for the model to complete
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about count. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image encoder does not record a graph
    # that is never back-propagated through
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, EOS doubles as padding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter (e.g., "A", "B",
    # "C", or "D"). An empty generation has no words; score it as a miss instead of
    # crashing the whole run with an IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a string like "['A']" - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = 0
total_predictions = 0

# Report each noise type's tally and accuracy, then fold it into the totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no scored rows is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over all scored rows (0% when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# ----------------------------- Order -------------------

## General Inferencing

In [None]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, which one is selected
if torch.cuda.is_available():
    print(f"CUDA is available. Device count: {torch.cuda.device_count()}")
    # Name the device that is actually current rather than always naming device 0,
    # so this line agrees with the index printed right after it
    print(f"Using device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
    print(f"Current device index: {torch.cuda.current_device()}")
else:
    print("CUDA is not available.")

# Brightness-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Checkpoint id from which both the chat processor and the model weights are loaded
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies; a missing key starts at {"correct": 0, "total": 0}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Skip rows whose Brightness image is missing. Image.open serves only as an
    # existence check (the path is handed to load_pil_images below), so the context
    # manager closes the handle instead of leaking one open file per row.
    image_path = f"Noisy DARE TEST/Brightness/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the multiple-choice
    # prompt, followed by an empty assistant turn for the model to complete
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image encoder does not record a graph
    # that is never back-propagated through
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, EOS doubles as padding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter (e.g., "A", "B",
    # "C", or "D"). An empty generation has no words; score it as a miss instead of
    # crashing the whole run with an IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a string like "['A']" - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = 0
total_predictions = 0

# Report each noise type's tally and accuracy, then fold it into the totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no scored rows is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over all scored rows (0% when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Contrast-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Checkpoint id from which both the chat processor and the model weights are loaded
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies; a missing key starts at {"correct": 0, "total": 0}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Skip rows whose Contrast image is missing. Image.open serves only as an
    # existence check (the path is handed to load_pil_images below), so the context
    # manager closes the handle instead of leaking one open file per row.
    image_path = f"Noisy DARE TEST/Contrast/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the multiple-choice
    # prompt, followed by an empty assistant turn for the model to complete
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image encoder does not record a graph
    # that is never back-propagated through
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, EOS doubles as padding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter (e.g., "A", "B",
    # "C", or "D"). An empty generation has no words; score it as a miss instead of
    # crashing the whole run with an IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a string like "['A']" - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = 0
total_predictions = 0

# Report each noise type's tally and accuracy, then fold it into the totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no scored rows is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over all scored rows (0% when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Defocus-blur-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Checkpoint id from which both the chat processor and the model weights are loaded
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies; a missing key starts at {"correct": 0, "total": 0}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Skip rows whose Defocus-blur image is missing. Image.open serves only as an
    # existence check (the path is handed to load_pil_images below), so the context
    # manager closes the handle instead of leaking one open file per row.
    image_path = f"Noisy DARE TEST/Defocus-blur/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the multiple-choice
    # prompt, followed by an empty assistant turn for the model to complete
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image encoder does not record a graph
    # that is never back-propagated through
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, EOS doubles as padding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter (e.g., "A", "B",
    # "C", or "D"). An empty generation has no words; score it as a miss instead of
    # crashing the whole run with an IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a string like "['A']" - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = 0
total_predictions = 0

# Report each noise type's tally and accuracy, then fold it into the totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no scored rows is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over all scored rows (0% when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Elastic-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Checkpoint id from which both the chat processor and the model weights are loaded
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies; a missing key starts at {"correct": 0, "total": 0}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Skip rows whose Elastic image is missing. Image.open serves only as an
    # existence check (the path is handed to load_pil_images below), so the context
    # manager closes the handle instead of leaking one open file per row.
    image_path = f"Noisy DARE TEST/Elastic/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the multiple-choice
    # prompt, followed by an empty assistant turn for the model to complete
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image encoder does not record a graph
    # that is never back-propagated through
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, EOS doubles as padding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter (e.g., "A", "B",
    # "C", or "D"). An empty generation has no words; score it as a miss instead of
    # crashing the whole run with an IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a string like "['A']" - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = 0
total_predictions = 0

# Report each noise type's tally and accuracy, then fold it into the totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no scored rows is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over all scored rows (0% when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Fog-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Checkpoint id from which both the chat processor and the model weights are loaded
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies; a missing key starts at {"correct": 0, "total": 0}
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Skip rows whose Fog image is missing. Image.open serves only as an
    # existence check (the path is handed to load_pil_images below), so the context
    # manager closes the handle instead of leaking one open file per row.
    image_path = f"Noisy DARE TEST/Fog/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn with the image and the multiple-choice
    # prompt, followed by an empty assistant turn for the model to complete
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image encoder does not record a graph
    # that is never back-propagated through
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, EOS doubles as padding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen letter (e.g., "A", "B",
    # "C", or "D"). An empty generation has no words; score it as a miss instead of
    # crashing the whole run with an IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): index 2 presumably picks the letter out of a string like "['A']" - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = 0
total_predictions = 0

# Report each noise type's tally and accuracy, then fold it into the totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no scored rows is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over all scored rows (0% when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Frost-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the model on the "order" questions using the Frost images.
# For every matching CSV row the model is asked the multiple-choice question
# about the image, and correct/total counts are accumulated per noise type
# (the `modified_question_function_name` column) in `noise_type_accuracy`.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# bfloat16 weights, moved to the GPU, in eval mode (inference only)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Probe the corresponding image. It is only opened here to detect a
    # missing file (load_pil_images below does the real loading), so the
    # handle is closed immediately instead of being left open.
    image_path = f"Noisy DARE TEST/Frost/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so no gradient graph is recorded.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (do_sample=False: no sampling)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty decode is
    # scored as a wrong answer instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1] if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): only the third character of `answer` is compared --
    # presumably the column is stored like "['A']"; confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Summarise the evaluation results held in `noise_type_accuracy`: print the
# accuracy of every noise type, then the accuracy aggregated over all of them.
total_correct, total_predictions = 0, 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this bucket into the running totals.
    total_correct = total_correct + correct
    total_predictions = total_predictions + total

# Aggregate over every noise type (again guarded against zero predictions).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Gaussian-noise-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the model on the "order" questions using the Gaussian-noise images.
# For every matching CSV row the model is asked the multiple-choice question
# about the image, and correct/total counts are accumulated per noise type
# (the `modified_question_function_name` column) in `noise_type_accuracy`.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# bfloat16 weights, moved to the GPU, in eval mode (inference only)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Probe the corresponding image. It is only opened here to detect a
    # missing file (load_pil_images below does the real loading), so the
    # handle is closed immediately instead of being left open.
    image_path = f"Noisy DARE TEST/Gaussian-noise/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so no gradient graph is recorded.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (do_sample=False: no sampling)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty decode is
    # scored as a wrong answer instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1] if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): only the third character of `answer` is compared --
    # presumably the column is stored like "['A']"; confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Summarise the evaluation results held in `noise_type_accuracy`: print the
# accuracy of every noise type, then the accuracy aggregated over all of them.
total_correct, total_predictions = 0, 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this bucket into the running totals.
    total_correct = total_correct + correct
    total_predictions = total_predictions + total

# Aggregate over every noise type (again guarded against zero predictions).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Impulse-Noise-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the model on the "order" questions using the Impulse-noise images.
# For every matching CSV row the model is asked the multiple-choice question
# about the image, and correct/total counts are accumulated per noise type
# (the `modified_question_function_name` column) in `noise_type_accuracy`.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# bfloat16 weights, moved to the GPU, in eval mode (inference only)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Probe the corresponding image. It is only opened here to detect a
    # missing file (load_pil_images below does the real loading), so the
    # handle is closed immediately instead of being left open.
    image_path = f"Noisy DARE TEST/Impulse-noise/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so no gradient graph is recorded.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (do_sample=False: no sampling)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty decode is
    # scored as a wrong answer instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1] if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): only the third character of `answer` is compared --
    # presumably the column is stored like "['A']"; confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Summarise the evaluation results held in `noise_type_accuracy`: print the
# accuracy of every noise type, then the accuracy aggregated over all of them.
total_correct, total_predictions = 0, 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this bucket into the running totals.
    total_correct = total_correct + correct
    total_predictions = total_predictions + total

# Aggregate over every noise type (again guarded against zero predictions).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# JPEG-compression-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the model on the "order" questions using the JPEG-compression images.
# For every matching CSV row the model is asked the multiple-choice question
# about the image, and correct/total counts are accumulated per noise type
# (the `modified_question_function_name` column) in `noise_type_accuracy`.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# bfloat16 weights, moved to the GPU, in eval mode (inference only)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Probe the corresponding image. It is only opened here to detect a
    # missing file (load_pil_images below does the real loading), so the
    # handle is closed immediately instead of being left open.
    image_path = f"Noisy DARE TEST/JPEG-compression/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so no gradient graph is recorded.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (do_sample=False: no sampling)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty decode is
    # scored as a wrong answer instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1] if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): only the third character of `answer` is compared --
    # presumably the column is stored like "['A']"; confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Summarise the evaluation results held in `noise_type_accuracy`: print the
# accuracy of every noise type, then the accuracy aggregated over all of them.
total_correct, total_predictions = 0, 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this bucket into the running totals.
    total_correct = total_correct + correct
    total_predictions = total_predictions + total

# Aggregate over every noise type (again guarded against zero predictions).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Pixelate-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the model on the "order" questions using the Pixelate images.
# For every matching CSV row the model is asked the multiple-choice question
# about the image, and correct/total counts are accumulated per noise type
# (the `modified_question_function_name` column) in `noise_type_accuracy`.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# bfloat16 weights, moved to the GPU, in eval mode (inference only)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Probe the corresponding image. It is only opened here to detect a
    # missing file (load_pil_images below does the real loading), so the
    # handle is closed immediately instead of being left open.
    image_path = f"Noisy DARE TEST/Pixelate/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so no gradient graph is recorded.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (do_sample=False: no sampling)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty decode is
    # scored as a wrong answer instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1] if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): only the third character of `answer` is compared --
    # presumably the column is stored like "['A']"; confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Summarise the evaluation results held in `noise_type_accuracy`: print the
# accuracy of every noise type, then the accuracy aggregated over all of them.
total_correct, total_predictions = 0, 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this bucket into the running totals.
    total_correct = total_correct + correct
    total_predictions = total_predictions + total

# Aggregate over every noise type (again guarded against zero predictions).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Rain-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the model on the "order" questions using the Rain images.
# For every matching CSV row the model is asked the multiple-choice question
# about the image, and correct/total counts are accumulated per noise type
# (the `modified_question_function_name` column) in `noise_type_accuracy`.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# bfloat16 weights, moved to the GPU, in eval mode (inference only)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Probe the corresponding image. It is only opened here to detect a
    # missing file (load_pil_images below does the real loading), so the
    # handle is closed immediately instead of being left open.
    image_path = f"Noisy DARE TEST/Rain/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so no gradient graph is recorded.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (do_sample=False: no sampling)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty decode is
    # scored as a wrong answer instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1] if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): only the third character of `answer` is compared --
    # presumably the column is stored like "['A']"; confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Summarise the evaluation results held in `noise_type_accuracy`: print the
# accuracy of every noise type, then the accuracy aggregated over all of them.
total_correct, total_predictions = 0, 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this bucket into the running totals.
    total_correct = total_correct + correct
    total_predictions = total_predictions + total

# Aggregate over every noise type (again guarded against zero predictions).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Saturation-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the model on the "order" questions using the Saturation images.
# For every matching CSV row the model is asked the multiple-choice question
# about the image, and correct/total counts are accumulated per noise type
# (the `modified_question_function_name` column) in `noise_type_accuracy`.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# bfloat16 weights, moved to the GPU, in eval mode (inference only)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Probe the corresponding image. It is only opened here to detect a
    # missing file (load_pil_images below does the real loading), so the
    # handle is closed immediately instead of being left open.
    image_path = f"Noisy DARE TEST/Saturation/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so no gradient graph is recorded.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (do_sample=False: no sampling)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty decode is
    # scored as a wrong answer instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1] if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): only the third character of `answer` is compared --
    # presumably the column is stored like "['A']"; confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Summarise the evaluation results held in `noise_type_accuracy`: print the
# accuracy of every noise type, then the accuracy aggregated over all of them.
total_correct, total_predictions = 0, 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this bucket into the running totals.
    total_correct = total_correct + correct
    total_predictions = total_predictions + total

# Aggregate over every noise type (again guarded against zero predictions).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Shot-noise-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate the model on the "order" questions using the Shot-noise images.
# For every matching CSV row the model is asked the multiple-choice question
# about the image, and correct/total counts are accumulated per noise type
# (the `modified_question_function_name` column) in `noise_type_accuracy`.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# bfloat16 weights, moved to the GPU, in eval mode (inference only)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Iterate over rows where category is "order"
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip non-order categories

    # Probe the corresponding image. It is only opened here to detect a
    # missing file (load_pil_images below does the real loading), so the
    # handle is closed immediately instead of being left open.
    image_path = f"Noisy DARE TEST/Shot-noise/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so no gradient graph is recorded.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (do_sample=False: no sampling)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Get the last word (e.g., "A", "B", "C", or "D"). An empty decode is
    # scored as a wrong answer instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1] if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters
    # NOTE(review): only the third character of `answer` is compared --
    # presumably the column is stored like "['A']"; confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Summarise the evaluation results held in `noise_type_accuracy`: print the
# accuracy of every noise type, then the accuracy aggregated over all of them.
total_correct, total_predictions = 0, 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this bucket into the running totals.
    total_correct = total_correct + correct
    total_predictions = total_predictions + total

# Aggregate over every noise type (again guarded against zero predictions).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Snow-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Load the Janus-Pro-7B chat processor, its tokenizer and the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate every "order" question against its Snow-corrupted image
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip rows from other categories

    # Verify the corresponding image exists before building the prompt
    image_path = f"Noisy DARE TEST/Snow/{row['path']}"
    try:
        # Opened only as an existence probe; the context manager closes the
        # handle immediately instead of leaking one open file per row.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the encoder pass keeps no graph
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding: do_sample=False)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # Fall back to "" when nothing was generated so an empty reply is scored
    # as wrong instead of aborting the whole run with IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the character at index 2 of the stored answer
    # NOTE(review): presumably answers are stored like "['A']" so index 2 is the letter - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals accumulated across all noise types
total_correct, total_predictions = 0, 0

# Report per-noise-type results, then fold them into the running totals
for noise_type in noise_type_accuracy:
    tally = noise_type_accuracy[noise_type]
    correct, total = tally["correct"], tally["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # Guard against a zero denominator for a noise type with no predictions
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over every prediction, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Spatter-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Load the Janus-Pro-7B chat processor, its tokenizer and the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate every "order" question against its Spatter-corrupted image
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip rows from other categories

    # Verify the corresponding image exists before building the prompt
    image_path = f"Noisy DARE TEST/Spatter/{row['path']}"
    try:
        # Opened only as an existence probe; the context manager closes the
        # handle immediately instead of leaking one open file per row.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the encoder pass keeps no graph
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding: do_sample=False)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # Fall back to "" when nothing was generated so an empty reply is scored
    # as wrong instead of aborting the whole run with IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the character at index 2 of the stored answer
    # NOTE(review): presumably answers are stored like "['A']" so index 2 is the letter - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals accumulated across all noise types
total_correct, total_predictions = 0, 0

# Report per-noise-type results, then fold them into the running totals
for noise_type in noise_type_accuracy:
    tally = noise_type_accuracy[noise_type]
    correct, total = tally["correct"], tally["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # Guard against a zero denominator for a noise type with no predictions
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over every prediction, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Speckle-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Load the Janus-Pro-7B chat processor, its tokenizer and the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate every "order" question against its Speckle-noise-corrupted image
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip rows from other categories

    # Verify the corresponding image exists before building the prompt
    image_path = f"Noisy DARE TEST/Speckle-noise/{row['path']}"
    try:
        # Opened only as an existence probe; the context manager closes the
        # handle immediately instead of leaking one open file per row.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the encoder pass keeps no graph
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding: do_sample=False)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # Fall back to "" when nothing was generated so an empty reply is scored
    # as wrong instead of aborting the whole run with IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the character at index 2 of the stored answer
    # NOTE(review): presumably answers are stored like "['A']" so index 2 is the letter - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals accumulated across all noise types
total_correct, total_predictions = 0, 0

# Report per-noise-type results, then fold them into the running totals
for noise_type in noise_type_accuracy:
    tally = noise_type_accuracy[noise_type]
    correct, total = tally["correct"], tally["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # Guard against a zero denominator for a noise type with no predictions
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over every prediction, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Zoom-blur-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Load the Janus-Pro-7B chat processor, its tokenizer and the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate every "order" question against its Zoom-Blur-corrupted image
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip rows from other categories

    # Verify the corresponding image exists before building the prompt
    image_path = f"Noisy DARE TEST/Zoom-Blur/{row['path']}"
    try:
        # Opened only as an existence probe; the context manager closes the
        # handle immediately instead of leaking one open file per row.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the encoder pass keeps no graph
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding: do_sample=False)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # Fall back to "" when nothing was generated so an empty reply is scored
    # as wrong instead of aborting the whole run with IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the character at index 2 of the stored answer
    # NOTE(review): presumably answers are stored like "['A']" so index 2 is the letter - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals accumulated across all noise types
total_correct, total_predictions = 0, 0

# Report per-noise-type results, then fold them into the running totals
for noise_type in noise_type_accuracy:
    tally = noise_type_accuracy[noise_type]
    correct, total = tally["correct"], tally["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # Guard against a zero denominator for a noise type with no predictions
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over every prediction, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Motion-blur-Order

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Load the Janus-Pro-7B chat processor, its tokenizer and the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate every "order" question against its Motion-blur-corrupted image
for index, row in data.iterrows():
    if row["category"] != "order":
        continue  # Skip rows from other categories

    # Verify the corresponding image exists before building the prompt
    image_path = f"Noisy DARE TEST/Motion-blur/{row['path']}"
    try:
        # Opened only as an existence probe; the context manager closes the
        # handle immediately instead of leaking one open file per row.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about order. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the encoder pass keeps no graph
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding: do_sample=False)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # Fall back to "" when nothing was generated so an empty reply is scored
    # as wrong instead of aborting the whole run with IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the character at index 2 of the stored answer
    # NOTE(review): presumably answers are stored like "['A']" so index 2 is the letter - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals accumulated across all noise types
total_correct, total_predictions = 0, 0

# Report per-noise-type results, then fold them into the running totals
for noise_type in noise_type_accuracy:
    tally = noise_type_accuracy[noise_type]
    correct, total = tally["correct"], tally["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # Guard against a zero denominator for a noise type with no predictions
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over every prediction, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# ----------------------------- Trick -------------------

## General Inferencing

In [None]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, which one
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    print(f"CUDA is available. Device count: {torch.cuda.device_count()}")
    print(f"Using device: {torch.cuda.get_device_name(0)}")
    print(f"Current device index: {torch.cuda.current_device()}")

# Brightness-Trick

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Load the Janus-Pro-7B chat processor, its tokenizer and the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate every "trick" question against its Brightness-corrupted image
for index, row in data.iterrows():
    if row["category"] != "trick":
        continue  # Skip rows from other categories

    # Verify the corresponding image exists before building the prompt
    image_path = f"Noisy DARE TEST/Brightness/{row['path']}"
    try:
        # Opened only as an existence probe; the context manager closes the
        # handle immediately instead of leaking one open file per row.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the encoder pass keeps no graph
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding: do_sample=False)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # Fall back to "" when nothing was generated so an empty reply is scored
    # as wrong instead of aborting the whole run with IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the character at index 2 of the stored answer
    # NOTE(review): presumably answers are stored like "['A']" so index 2 is the letter - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals accumulated across all noise types
total_correct, total_predictions = 0, 0

# Report per-noise-type results, then fold them into the running totals
for noise_type in noise_type_accuracy:
    tally = noise_type_accuracy[noise_type]
    correct, total = tally["correct"], tally["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # Guard against a zero denominator for a noise type with no predictions
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over every prediction, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Contrast-Trick

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Load the Janus-Pro-7B chat processor, its tokenizer and the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate every "trick" question against its Contrast-corrupted image
for index, row in data.iterrows():
    if row["category"] != "trick":
        continue  # Skip rows from other categories

    # Verify the corresponding image exists before building the prompt
    image_path = f"Noisy DARE TEST/Contrast/{row['path']}"
    try:
        # Opened only as an existence probe; the context manager closes the
        # handle immediately instead of leaking one open file per row.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the encoder pass keeps no graph
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding: do_sample=False)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # Fall back to "" when nothing was generated so an empty reply is scored
    # as wrong instead of aborting the whole run with IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the character at index 2 of the stored answer
    # NOTE(review): presumably answers are stored like "['A']" so index 2 is the letter - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals accumulated across all noise types
total_correct, total_predictions = 0, 0

# Report per-noise-type results, then fold them into the running totals
for noise_type in noise_type_accuracy:
    tally = noise_type_accuracy[noise_type]
    correct, total = tally["correct"], tally["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # Guard against a zero denominator for a noise type with no predictions
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over every prediction, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Defocus-blur-Trick

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Load the Janus-Pro-7B chat processor, its tokenizer and the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
# Cast to bfloat16, move to the GPU and switch to eval mode for inference
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate every "trick" question against its Defocus-blur-corrupted image
for index, row in data.iterrows():
    if row["category"] != "trick":
        continue  # Skip rows from other categories

    # Verify the corresponding image exists before building the prompt
    image_path = f"Noisy DARE TEST/Defocus-blur/{row['path']}"
    try:
        # Opened only as an existence probe; the context manager closes the
        # handle immediately instead of leaking one open file per row.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the encoder pass keeps no graph
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding: do_sample=False)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # Fall back to "" when nothing was generated so an empty reply is scored
    # as wrong instead of aborting the whole run with IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the character at index 2 of the stored answer
    # NOTE(review): presumably answers are stored like "['A']" so index 2 is the letter - confirm against the CSV
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals accumulated across all noise types
total_correct, total_predictions = 0, 0

# Report per-noise-type results, then fold them into the running totals
for noise_type in noise_type_accuracy:
    tally = noise_type_accuracy[noise_type]
    correct, total = tally["correct"], tally["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # Guard against a zero denominator for a noise type with no predictions
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy over every prediction, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Elastic-Trick

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "trick" multiple-choice questions using the images under
# "Noisy DARE TEST/Elastic/", tallying correct/total predictions per question-noise type
# in `noise_type_accuracy` (consumed by the summary cell that follows).
# NOTE(review): the section heading says "Count" but this cell filters on the "trick"
# category and prompts about "trick" - confirm which category is intended.
# NOTE(review): the model is re-loaded in every evaluation cell of this notebook; consider
# loading it once in a shared cell if GPU memory or runtime becomes a problem.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Only rows of the "trick" category are evaluated; filter once instead of per iteration.
trick_rows = data[data["category"] == "trick"]

for index, row in trick_rows.iterrows():
    # Probe that the image exists and can be opened, closing the handle right away.
    # The opened image itself is not needed: the conversation below carries only the
    # path, and load_pil_images() is given that conversation.
    image_path = f"Noisy DARE TEST/Elastic/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: skip autograd bookkeeping while encoding the image and generating.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, up to 512 new tokens)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word as the chosen letter. An empty generation
    # yields "" instead of raising IndexError, and surrounding punctuation/quotes are
    # stripped so replies such as "A." or "'A'" still compare equal to "A".
    answer_words = answer.split()
    predicted_answer = answer_words[-1].strip(".,:;!?()[]'\"") if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # row["answer"][2] is the third character of the stored answer - presumably the CSV
    # stores it in a form like "['A']"; TODO confirm against the dataset.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`
# and accumulate the running totals needed for the overall figure.
total_correct = 0
total_predictions = 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0 instead of dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this noise type into the overall counters.
    total_correct += correct
    total_predictions += total

# Overall accuracy across all noise types (0 when nothing was evaluated).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Fog-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "trick" multiple-choice questions using the images under
# "Noisy DARE TEST/Fog/", tallying correct/total predictions per question-noise type
# in `noise_type_accuracy` (consumed by the summary cell that follows).
# NOTE(review): the section heading says "Count" but this cell filters on the "trick"
# category and prompts about "trick" - confirm which category is intended.
# NOTE(review): the model is re-loaded in every evaluation cell of this notebook; consider
# loading it once in a shared cell if GPU memory or runtime becomes a problem.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Only rows of the "trick" category are evaluated; filter once instead of per iteration.
trick_rows = data[data["category"] == "trick"]

for index, row in trick_rows.iterrows():
    # Probe that the image exists and can be opened, closing the handle right away.
    # The opened image itself is not needed: the conversation below carries only the
    # path, and load_pil_images() is given that conversation.
    image_path = f"Noisy DARE TEST/Fog/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: skip autograd bookkeeping while encoding the image and generating.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, up to 512 new tokens)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word as the chosen letter. An empty generation
    # yields "" instead of raising IndexError, and surrounding punctuation/quotes are
    # stripped so replies such as "A." or "'A'" still compare equal to "A".
    answer_words = answer.split()
    predicted_answer = answer_words[-1].strip(".,:;!?()[]'\"") if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # row["answer"][2] is the third character of the stored answer - presumably the CSV
    # stores it in a form like "['A']"; TODO confirm against the dataset.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`
# and accumulate the running totals needed for the overall figure.
total_correct = 0
total_predictions = 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0 instead of dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this noise type into the overall counters.
    total_correct += correct
    total_predictions += total

# Overall accuracy across all noise types (0 when nothing was evaluated).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Frost-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "trick" multiple-choice questions using the images under
# "Noisy DARE TEST/Frost/", tallying correct/total predictions per question-noise type
# in `noise_type_accuracy` (consumed by the summary cell that follows).
# NOTE(review): the section heading says "Count" but this cell filters on the "trick"
# category and prompts about "trick" - confirm which category is intended.
# NOTE(review): the model is re-loaded in every evaluation cell of this notebook; consider
# loading it once in a shared cell if GPU memory or runtime becomes a problem.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Only rows of the "trick" category are evaluated; filter once instead of per iteration.
trick_rows = data[data["category"] == "trick"]

for index, row in trick_rows.iterrows():
    # Probe that the image exists and can be opened, closing the handle right away.
    # The opened image itself is not needed: the conversation below carries only the
    # path, and load_pil_images() is given that conversation.
    image_path = f"Noisy DARE TEST/Frost/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: skip autograd bookkeeping while encoding the image and generating.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, up to 512 new tokens)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word as the chosen letter. An empty generation
    # yields "" instead of raising IndexError, and surrounding punctuation/quotes are
    # stripped so replies such as "A." or "'A'" still compare equal to "A".
    answer_words = answer.split()
    predicted_answer = answer_words[-1].strip(".,:;!?()[]'\"") if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # row["answer"][2] is the third character of the stored answer - presumably the CSV
    # stores it in a form like "['A']"; TODO confirm against the dataset.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`
# and accumulate the running totals needed for the overall figure.
total_correct = 0
total_predictions = 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0 instead of dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this noise type into the overall counters.
    total_correct += correct
    total_predictions += total

# Overall accuracy across all noise types (0 when nothing was evaluated).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Gaussian-noise-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "trick" multiple-choice questions using the images under
# "Noisy DARE TEST/Gaussian-noise/", tallying correct/total predictions per question-noise
# type in `noise_type_accuracy` (consumed by the summary cell that follows).
# NOTE(review): the section heading says "Count" but this cell filters on the "trick"
# category and prompts about "trick" - confirm which category is intended.
# NOTE(review): the model is re-loaded in every evaluation cell of this notebook; consider
# loading it once in a shared cell if GPU memory or runtime becomes a problem.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Only rows of the "trick" category are evaluated; filter once instead of per iteration.
trick_rows = data[data["category"] == "trick"]

for index, row in trick_rows.iterrows():
    # Probe that the image exists and can be opened, closing the handle right away.
    # The opened image itself is not needed: the conversation below carries only the
    # path, and load_pil_images() is given that conversation.
    image_path = f"Noisy DARE TEST/Gaussian-noise/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: skip autograd bookkeeping while encoding the image and generating.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, up to 512 new tokens)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word as the chosen letter. An empty generation
    # yields "" instead of raising IndexError, and surrounding punctuation/quotes are
    # stripped so replies such as "A." or "'A'" still compare equal to "A".
    answer_words = answer.split()
    predicted_answer = answer_words[-1].strip(".,:;!?()[]'\"") if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # row["answer"][2] is the third character of the stored answer - presumably the CSV
    # stores it in a form like "['A']"; TODO confirm against the dataset.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`
# and accumulate the running totals needed for the overall figure.
total_correct = 0
total_predictions = 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0 instead of dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this noise type into the overall counters.
    total_correct += correct
    total_predictions += total

# Overall accuracy across all noise types (0 when nothing was evaluated).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Impulse-Noise-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "trick" multiple-choice questions using the images under
# "Noisy DARE TEST/Impulse-noise/", tallying correct/total predictions per question-noise
# type in `noise_type_accuracy` (consumed by the summary cell that follows).
# NOTE(review): the section heading says "Count" but this cell filters on the "trick"
# category and prompts about "trick" - confirm which category is intended.
# NOTE(review): the model is re-loaded in every evaluation cell of this notebook; consider
# loading it once in a shared cell if GPU memory or runtime becomes a problem.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Only rows of the "trick" category are evaluated; filter once instead of per iteration.
trick_rows = data[data["category"] == "trick"]

for index, row in trick_rows.iterrows():
    # Probe that the image exists and can be opened, closing the handle right away.
    # The opened image itself is not needed: the conversation below carries only the
    # path, and load_pil_images() is given that conversation.
    image_path = f"Noisy DARE TEST/Impulse-noise/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: skip autograd bookkeeping while encoding the image and generating.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, up to 512 new tokens)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word as the chosen letter. An empty generation
    # yields "" instead of raising IndexError, and surrounding punctuation/quotes are
    # stripped so replies such as "A." or "'A'" still compare equal to "A".
    answer_words = answer.split()
    predicted_answer = answer_words[-1].strip(".,:;!?()[]'\"") if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # row["answer"][2] is the third character of the stored answer - presumably the CSV
    # stores it in a form like "['A']"; TODO confirm against the dataset.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`
# and accumulate the running totals needed for the overall figure.
total_correct = 0
total_predictions = 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0 instead of dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this noise type into the overall counters.
    total_correct += correct
    total_predictions += total

# Overall accuracy across all noise types (0 when nothing was evaluated).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# JPEG-compression-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "trick" multiple-choice questions using the images under
# "Noisy DARE TEST/JPEG-compression/", tallying correct/total predictions per
# question-noise type in `noise_type_accuracy` (consumed by the summary cell that follows).
# NOTE(review): the section heading says "Count" but this cell filters on the "trick"
# category and prompts about "trick" - confirm which category is intended.
# NOTE(review): the model is re-loaded in every evaluation cell of this notebook; consider
# loading it once in a shared cell if GPU memory or runtime becomes a problem.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Only rows of the "trick" category are evaluated; filter once instead of per iteration.
trick_rows = data[data["category"] == "trick"]

for index, row in trick_rows.iterrows():
    # Probe that the image exists and can be opened, closing the handle right away.
    # The opened image itself is not needed: the conversation below carries only the
    # path, and load_pil_images() is given that conversation.
    image_path = f"Noisy DARE TEST/JPEG-compression/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: skip autograd bookkeeping while encoding the image and generating.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, up to 512 new tokens)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word as the chosen letter. An empty generation
    # yields "" instead of raising IndexError, and surrounding punctuation/quotes are
    # stripped so replies such as "A." or "'A'" still compare equal to "A".
    answer_words = answer.split()
    predicted_answer = answer_words[-1].strip(".,:;!?()[]'\"") if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # row["answer"][2] is the third character of the stored answer - presumably the CSV
    # stores it in a form like "['A']"; TODO confirm against the dataset.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`
# and accumulate the running totals needed for the overall figure.
total_correct = 0
total_predictions = 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0 instead of dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this noise type into the overall counters.
    total_correct += correct
    total_predictions += total

# Overall accuracy across all noise types (0 when nothing was evaluated).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Pixelate-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "trick" multiple-choice questions using the images under
# "Noisy DARE TEST/Pixelate/", tallying correct/total predictions per question-noise type
# in `noise_type_accuracy` (consumed by the summary cell that follows).
# NOTE(review): the section heading says "Count" but this cell filters on the "trick"
# category and prompts about "trick" - confirm which category is intended.
# NOTE(review): the model is re-loaded in every evaluation cell of this notebook; consider
# loading it once in a shared cell if GPU memory or runtime becomes a problem.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Only rows of the "trick" category are evaluated; filter once instead of per iteration.
trick_rows = data[data["category"] == "trick"]

for index, row in trick_rows.iterrows():
    # Probe that the image exists and can be opened, closing the handle right away.
    # The opened image itself is not needed: the conversation below carries only the
    # path, and load_pil_images() is given that conversation.
    image_path = f"Noisy DARE TEST/Pixelate/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: skip autograd bookkeeping while encoding the image and generating.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, up to 512 new tokens)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word as the chosen letter. An empty generation
    # yields "" instead of raising IndexError, and surrounding punctuation/quotes are
    # stripped so replies such as "A." or "'A'" still compare equal to "A".
    answer_words = answer.split()
    predicted_answer = answer_words[-1].strip(".,:;!?()[]'\"") if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # row["answer"][2] is the third character of the stored answer - presumably the CSV
    # stores it in a form like "['A']"; TODO confirm against the dataset.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`
# and accumulate the running totals needed for the overall figure.
total_correct = 0
total_predictions = 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0 instead of dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this noise type into the overall counters.
    total_correct += correct
    total_predictions += total

# Overall accuracy across all noise types (0 when nothing was evaluated).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Rain-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "trick" multiple-choice questions using the images under
# "Noisy DARE TEST/Rain/", tallying correct/total predictions per question-noise type
# in `noise_type_accuracy` (consumed by the summary cell that follows).
# NOTE(review): the section heading says "Count" but this cell filters on the "trick"
# category and prompts about "trick" - confirm which category is intended.
# NOTE(review): the model is re-loaded in every evaluation cell of this notebook; consider
# loading it once in a shared cell if GPU memory or runtime becomes a problem.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Only rows of the "trick" category are evaluated; filter once instead of per iteration.
trick_rows = data[data["category"] == "trick"]

for index, row in trick_rows.iterrows():
    # Probe that the image exists and can be opened, closing the handle right away.
    # The opened image itself is not needed: the conversation below carries only the
    # path, and load_pil_images() is given that conversation.
    image_path = f"Noisy DARE TEST/Rain/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: skip autograd bookkeeping while encoding the image and generating.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding, up to 512 new tokens)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word as the chosen letter. An empty generation
    # yields "" instead of raising IndexError, and surrounding punctuation/quotes are
    # stripped so replies such as "A." or "'A'" still compare equal to "A".
    answer_words = answer.split()
    predicted_answer = answer_words[-1].strip(".,:;!?()[]'\"") if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # row["answer"][2] is the third character of the stored answer - presumably the CSV
    # stores it in a form like "['A']"; TODO confirm against the dataset.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type collected in `noise_type_accuracy`
# and accumulate the running totals needed for the overall figure.
total_correct = 0
total_predictions = 0

for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # An empty bucket reports 0 instead of dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    # Fold this noise type into the overall counters.
    total_correct += correct
    total_predictions += total

# Overall accuracy across all noise types (0 when nothing was evaluated).
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Saturation-Count

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally of correct answers and attempts
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "trick"
for index, row in data.iterrows():
    if row["category"] != "trick":
        continue  # Skip every other category

    # Path of this row's image inside the Saturation folder
    image_path = f"Noisy DARE TEST/Saturation/{row['path']}"
    try:
        # Existence/readability probe only: load_pil_images() below opens the
        # file again, so close this handle immediately instead of leaking it.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Evaluation needs no gradients, so disable autograd tracking for the forward passes
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen option (e.g., "A", "B", "C", or "D").
    # An empty generation has no words; score it as a miss instead of letting
    # [-1] raise IndexError and abort the whole run.
    answer_tokens = answer.split()
    predicted_answer = answer_tokens[-1] if answer_tokens else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a value stored like "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across all noise types
total_correct = 0
total_predictions = 0

# Report each noise type separately while accumulating the overall totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no attempts is reported as 0% rather than dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy, with the same guard against an empty tally
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Shot-noise-Trick

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally of correct answers and attempts
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "trick"
for index, row in data.iterrows():
    if row["category"] != "trick":
        continue  # Skip every other category

    # Path of this row's image inside the Shot-noise folder
    image_path = f"Noisy DARE TEST/Shot-noise/{row['path']}"
    try:
        # Existence/readability probe only: load_pil_images() below opens the
        # file again, so close this handle immediately instead of leaking it.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Evaluation needs no gradients, so disable autograd tracking for the forward passes
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen option (e.g., "A", "B", "C", or "D").
    # An empty generation has no words; score it as a miss instead of letting
    # [-1] raise IndexError and abort the whole run.
    answer_tokens = answer.split()
    predicted_answer = answer_tokens[-1] if answer_tokens else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a value stored like "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across all noise types
total_correct = 0
total_predictions = 0

# Report each noise type separately while accumulating the overall totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no attempts is reported as 0% rather than dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy, with the same guard against an empty tally
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Snow-Trick

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally of correct answers and attempts
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "trick"
for index, row in data.iterrows():
    if row["category"] != "trick":
        continue  # Skip every other category

    # Path of this row's image inside the Snow folder
    image_path = f"Noisy DARE TEST/Snow/{row['path']}"
    try:
        # Existence/readability probe only: load_pil_images() below opens the
        # file again, so close this handle immediately instead of leaking it.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Evaluation needs no gradients, so disable autograd tracking for the forward passes
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen option (e.g., "A", "B", "C", or "D").
    # An empty generation has no words; score it as a miss instead of letting
    # [-1] raise IndexError and abort the whole run.
    answer_tokens = answer.split()
    predicted_answer = answer_tokens[-1] if answer_tokens else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a value stored like "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across all noise types
total_correct = 0
total_predictions = 0

# Report each noise type separately while accumulating the overall totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no attempts is reported as 0% rather than dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy, with the same guard against an empty tally
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Spatter-Trick

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally of correct answers and attempts
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "trick"
for index, row in data.iterrows():
    if row["category"] != "trick":
        continue  # Skip every other category

    # Path of this row's image inside the Spatter folder
    image_path = f"Noisy DARE TEST/Spatter/{row['path']}"
    try:
        # Existence/readability probe only: load_pil_images() below opens the
        # file again, so close this handle immediately instead of leaking it.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Evaluation needs no gradients, so disable autograd tracking for the forward passes
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen option (e.g., "A", "B", "C", or "D").
    # An empty generation has no words; score it as a miss instead of letting
    # [-1] raise IndexError and abort the whole run.
    answer_tokens = answer.split()
    predicted_answer = answer_tokens[-1] if answer_tokens else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a value stored like "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across all noise types
total_correct = 0
total_predictions = 0

# Report each noise type separately while accumulating the overall totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no attempts is reported as 0% rather than dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy, with the same guard against an empty tally
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Speckle-Trick

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally of correct answers and attempts
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "trick"
for index, row in data.iterrows():
    if row["category"] != "trick":
        continue  # Skip every other category

    # Path of this row's image inside the Speckle-noise folder
    image_path = f"Noisy DARE TEST/Speckle-noise/{row['path']}"
    try:
        # Existence/readability probe only: load_pil_images() below opens the
        # file again, so close this handle immediately instead of leaking it.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Evaluation needs no gradients, so disable autograd tracking for the forward passes
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen option (e.g., "A", "B", "C", or "D").
    # An empty generation has no words; score it as a miss instead of letting
    # [-1] raise IndexError and abort the whole run.
    answer_tokens = answer.split()
    predicted_answer = answer_tokens[-1] if answer_tokens else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a value stored like "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across all noise types
total_correct = 0
total_predictions = 0

# Report each noise type separately while accumulating the overall totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no attempts is reported as 0% rather than dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy, with the same guard against an empty tally
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Zoom-blur-Trick

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally of correct answers and attempts
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "trick"
for index, row in data.iterrows():
    if row["category"] != "trick":
        continue  # Skip every other category

    # Path of this row's image inside the Zoom-Blur folder
    image_path = f"Noisy DARE TEST/Zoom-Blur/{row['path']}"
    try:
        # Existence/readability probe only: load_pil_images() below opens the
        # file again, so close this handle immediately instead of leaking it.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Evaluation needs no gradients, so disable autograd tracking for the forward passes
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen option (e.g., "A", "B", "C", or "D").
    # An empty generation has no words; score it as a miss instead of letting
    # [-1] raise IndexError and abort the whole run.
    answer_tokens = answer.split()
    predicted_answer = answer_tokens[-1] if answer_tokens else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a value stored like "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across all noise types
total_correct = 0
total_predictions = 0

# Report each noise type separately while accumulating the overall totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no attempts is reported as 0% rather than dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy, with the same guard against an empty tally
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Motion-blur-Trick

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally of correct answers and attempts
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "trick"
for index, row in data.iterrows():
    if row["category"] != "trick":
        continue  # Skip every other category

    # Path of this row's image inside the Motion-blur folder
    image_path = f"Noisy DARE TEST/Motion-blur/{row['path']}"
    try:
        # Existence/readability probe only: load_pil_images() below opens the
        # file again, so close this handle immediately instead of leaking it.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about trick. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Evaluation needs no gradients, so disable autograd tracking for the forward passes
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen option (e.g., "A", "B", "C", or "D").
    # An empty generation has no words; score it as a miss instead of letting
    # [-1] raise IndexError and abort the whole run.
    answer_tokens = answer.split()
    predicted_answer = answer_tokens[-1] if answer_tokens else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a value stored like "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across all noise types
total_correct = 0
total_predictions = 0

# Report each noise type separately while accumulating the overall totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no attempts is reported as 0% rather than dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy, with the same guard against an empty tally
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# ----------------------------- VCR -------------------

## General Inferencing

In [None]:
import torch

# Report whether a CUDA device can be used, and which one
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    device_count = torch.cuda.device_count()
    print(f"CUDA is available. Device count: {device_count}")
    # The name shown is always that of device 0; the index line reports the current device
    print(f"Using device: {torch.cuda.get_device_name(0)}")
    print(f"Current device index: {torch.cuda.current_device()}")

# Brightness-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally of correct answers and attempts
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "vcr"
for index, row in data.iterrows():
    if row["category"] != "vcr":
        continue  # Skip every other category

    # Path of this row's image inside the Brightness folder
    image_path = f"Noisy DARE TEST/Brightness/{row['path']}"
    try:
        # Existence/readability probe only: load_pil_images() below opens the
        # file again, so close this handle immediately instead of leaking it.
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Evaluation needs no gradients, so disable autograd tracking for the forward passes
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # The last whitespace-separated word is taken as the chosen option (e.g., "A", "B", "C", or "D").
    # An empty generation has no words; score it as a miss instead of letting
    # [-1] raise IndexError and abort the whole run.
    answer_tokens = answer.split()
    predicted_answer = answer_tokens[-1] if answer_tokens else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a value stored like "['A']" - confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across all noise types
total_correct = 0
total_predictions = 0

# Report each noise type separately while accumulating the overall totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no attempts is reported as 0% rather than dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy, with the same guard against an empty tally
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Contrast-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "vcr" questions using the Contrast-corrupted
# images. Per-row results accumulate in `noise_type_accuracy`, keyed by the
# row's "modified_question_function_name" value.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Punctuation the model may wrap around the chosen letter (e.g. "A.", "'A'",
# "(A)"); stripped before comparing so such replies are not scored as wrong.
answer_wrappers = ".,:;!?()[]{}'\"`*"

# Iterate over rows where category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Confirm the image can be opened. The context manager closes the file
    # handle immediately instead of leaking one handle per row.
    image_path = f"Noisy DARE TEST/Contrast/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image-encoder pass does not record
    # a graph for a backward pass that never happens.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Use the last word of the reply (e.g., "A", "B", "C", or "D"). An empty
    # reply yields "" (scored as incorrect) instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1].strip(answer_wrappers) if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a string such as
    # "['A']" -- confirm against the CSV's "answer" column format.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type recorded in `noise_type_accuracy`,
# followed by the accuracy aggregated over all noise types.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no predictions is reported as 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals.
total_correct = sum(entry["correct"] for entry in noise_type_accuracy.values())
total_predictions = sum(entry["total"] for entry in noise_type_accuracy.values())
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")

if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Defocus-blur-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "vcr" questions using the Defocus-blur-corrupted
# images. Per-row results accumulate in `noise_type_accuracy`, keyed by the
# row's "modified_question_function_name" value.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Punctuation the model may wrap around the chosen letter (e.g. "A.", "'A'",
# "(A)"); stripped before comparing so such replies are not scored as wrong.
answer_wrappers = ".,:;!?()[]{}'\"`*"

# Iterate over rows where category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Confirm the image can be opened. The context manager closes the file
    # handle immediately instead of leaking one handle per row.
    image_path = f"Noisy DARE TEST/Defocus-blur/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image-encoder pass does not record
    # a graph for a backward pass that never happens.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Use the last word of the reply (e.g., "A", "B", "C", or "D"). An empty
    # reply yields "" (scored as incorrect) instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1].strip(answer_wrappers) if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a string such as
    # "['A']" -- confirm against the CSV's "answer" column format.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type recorded in `noise_type_accuracy`,
# followed by the accuracy aggregated over all noise types.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no predictions is reported as 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals.
total_correct = sum(entry["correct"] for entry in noise_type_accuracy.values())
total_predictions = sum(entry["total"] for entry in noise_type_accuracy.values())
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")

if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Elastic-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "vcr" questions using the Elastic-corrupted
# images. Per-row results accumulate in `noise_type_accuracy`, keyed by the
# row's "modified_question_function_name" value.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Punctuation the model may wrap around the chosen letter (e.g. "A.", "'A'",
# "(A)"); stripped before comparing so such replies are not scored as wrong.
answer_wrappers = ".,:;!?()[]{}'\"`*"

# Iterate over rows where category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Confirm the image can be opened. The context manager closes the file
    # handle immediately instead of leaking one handle per row.
    image_path = f"Noisy DARE TEST/Elastic/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image-encoder pass does not record
    # a graph for a backward pass that never happens.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Use the last word of the reply (e.g., "A", "B", "C", or "D"). An empty
    # reply yields "" (scored as incorrect) instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1].strip(answer_wrappers) if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a string such as
    # "['A']" -- confirm against the CSV's "answer" column format.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type recorded in `noise_type_accuracy`,
# followed by the accuracy aggregated over all noise types.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no predictions is reported as 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals.
total_correct = sum(entry["correct"] for entry in noise_type_accuracy.values())
total_predictions = sum(entry["total"] for entry in noise_type_accuracy.values())
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")

if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Fog-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "vcr" questions using the Fog-corrupted
# images. Per-row results accumulate in `noise_type_accuracy`, keyed by the
# row's "modified_question_function_name" value.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Punctuation the model may wrap around the chosen letter (e.g. "A.", "'A'",
# "(A)"); stripped before comparing so such replies are not scored as wrong.
answer_wrappers = ".,:;!?()[]{}'\"`*"

# Iterate over rows where category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Confirm the image can be opened. The context manager closes the file
    # handle immediately instead of leaking one handle per row.
    image_path = f"Noisy DARE TEST/Fog/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image-encoder pass does not record
    # a graph for a backward pass that never happens.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Use the last word of the reply (e.g., "A", "B", "C", or "D"). An empty
    # reply yields "" (scored as incorrect) instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1].strip(answer_wrappers) if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a string such as
    # "['A']" -- confirm against the CSV's "answer" column format.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type recorded in `noise_type_accuracy`,
# followed by the accuracy aggregated over all noise types.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no predictions is reported as 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals.
total_correct = sum(entry["correct"] for entry in noise_type_accuracy.values())
total_predictions = sum(entry["total"] for entry in noise_type_accuracy.values())
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")

if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Frost-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "vcr" questions using the Frost-corrupted
# images. Per-row results accumulate in `noise_type_accuracy`, keyed by the
# row's "modified_question_function_name" value.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Punctuation the model may wrap around the chosen letter (e.g. "A.", "'A'",
# "(A)"); stripped before comparing so such replies are not scored as wrong.
answer_wrappers = ".,:;!?()[]{}'\"`*"

# Iterate over rows where category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Confirm the image can be opened. The context manager closes the file
    # handle immediately instead of leaking one handle per row.
    image_path = f"Noisy DARE TEST/Frost/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image-encoder pass does not record
    # a graph for a backward pass that never happens.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Use the last word of the reply (e.g., "A", "B", "C", or "D"). An empty
    # reply yields "" (scored as incorrect) instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1].strip(answer_wrappers) if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a string such as
    # "['A']" -- confirm against the CSV's "answer" column format.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type recorded in `noise_type_accuracy`,
# followed by the accuracy aggregated over all noise types.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no predictions is reported as 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals.
total_correct = sum(entry["correct"] for entry in noise_type_accuracy.values())
total_predictions = sum(entry["total"] for entry in noise_type_accuracy.values())
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")

if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Gaussian-noise-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "vcr" questions using the Gaussian-noise-corrupted
# images. Per-row results accumulate in `noise_type_accuracy`, keyed by the
# row's "modified_question_function_name" value.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Punctuation the model may wrap around the chosen letter (e.g. "A.", "'A'",
# "(A)"); stripped before comparing so such replies are not scored as wrong.
answer_wrappers = ".,:;!?()[]{}'\"`*"

# Iterate over rows where category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Confirm the image can be opened. The context manager closes the file
    # handle immediately instead of leaking one handle per row.
    image_path = f"Noisy DARE TEST/Gaussian-noise/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image-encoder pass does not record
    # a graph for a backward pass that never happens.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Use the last word of the reply (e.g., "A", "B", "C", or "D"). An empty
    # reply yields "" (scored as incorrect) instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1].strip(answer_wrappers) if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a string such as
    # "['A']" -- confirm against the CSV's "answer" column format.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type recorded in `noise_type_accuracy`,
# followed by the accuracy aggregated over all noise types.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no predictions is reported as 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals.
total_correct = sum(entry["correct"] for entry in noise_type_accuracy.values())
total_predictions = sum(entry["total"] for entry in noise_type_accuracy.values())
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")

if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Impulse-noise-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "vcr" questions using the Impulse-noise-corrupted
# images. Per-row results accumulate in `noise_type_accuracy`, keyed by the
# row's "modified_question_function_name" value.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Punctuation the model may wrap around the chosen letter (e.g. "A.", "'A'",
# "(A)"); stripped before comparing so such replies are not scored as wrong.
answer_wrappers = ".,:;!?()[]{}'\"`*"

# Iterate over rows where category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Confirm the image can be opened. The context manager closes the file
    # handle immediately instead of leaking one handle per row.
    image_path = f"Noisy DARE TEST/Impulse-noise/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image-encoder pass does not record
    # a graph for a backward pass that never happens.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Use the last word of the reply (e.g., "A", "B", "C", or "D"). An empty
    # reply yields "" (scored as incorrect) instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1].strip(answer_wrappers) if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a string such as
    # "['A']" -- confirm against the CSV's "answer" column format.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type recorded in `noise_type_accuracy`,
# followed by the accuracy aggregated over all noise types.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no predictions is reported as 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals.
total_correct = sum(entry["correct"] for entry in noise_type_accuracy.values())
total_predictions = sum(entry["total"] for entry in noise_type_accuracy.values())
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")

if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# JPEG-compression-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Evaluate Janus-Pro-7B on the "vcr" questions using the JPEG-compression-corrupted
# images. Per-row results accumulate in `noise_type_accuracy`, keyed by the
# row's "modified_question_function_name" value.

# specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Initialize counters for accuracy per noise type
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Punctuation the model may wrap around the chosen letter (e.g. "A.", "'A'",
# "(A)"); stripped before comparing so such replies are not scored as wrong.
answer_wrappers = ".,:;!?()[]{}'\"`*"

# Iterate over rows where category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Confirm the image can be opened. The context manager closes the file
    # handle immediately instead of leaking one handle per row.
    image_path = f"Noisy DARE TEST/JPEG-compression/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image-encoder pass does not record
    # a graph for a backward pass that never happens.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Use the last word of the reply (e.g., "A", "B", "C", or "D"). An empty
    # reply yields "" (scored as incorrect) instead of raising IndexError.
    words = answer.split()
    predicted_answer = words[-1].strip(answer_wrappers) if words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # NOTE(review): index 2 presumably picks the letter out of a string such as
    # "['A']" -- confirm against the CSV's "answer" column format.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Report accuracy for every noise type recorded in `noise_type_accuracy`,
# followed by the accuracy aggregated over all noise types.
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")
    # A noise type with no predictions is reported as 0% rather than dividing by zero.
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

# Aggregate the per-noise-type counters into overall totals.
total_correct = sum(entry["correct"] for entry in noise_type_accuracy.values())
total_predictions = sum(entry["total"] for entry in noise_type_accuracy.values())
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")

if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Pixelate-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
# NOTE(review): the checkpoint is loaded again every time this cell runs; if an
# earlier cell already loaded the same model, a single shared setup cell would
# save the reload time.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally; a noise type starts at zero the first time it is seen
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Locate the Pixelate-corrupted image for this question
    image_path = f"Noisy DARE TEST/Pixelate/{row['path']}"
    try:
        # Opened only to confirm the file exists; the context manager closes the
        # handle right away (load_pil_images below is given the path, not this handle).
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Nothing in this cell back-propagates, so run the image encoder and the
    # generation without autograd bookkeeping.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # An empty generation yields "" (scored as wrong) instead of raising IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # Assumes the option letter is the third character of the `answer` field
    # (e.g. "['A']") - TODO confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = total_predictions = 0

# Report each noise type in turn, folding its counts into the running totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct, total_predictions = total_correct + correct, total_predictions + total

# Summarize over all noise types, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")


# Rain-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
# NOTE(review): the checkpoint is loaded again every time this cell runs; if an
# earlier cell already loaded the same model, a single shared setup cell would
# save the reload time.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally; a noise type starts at zero the first time it is seen
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Locate the Rain-corrupted image for this question
    image_path = f"Noisy DARE TEST/Rain/{row['path']}"
    try:
        # Opened only to confirm the file exists; the context manager closes the
        # handle right away (load_pil_images below is given the path, not this handle).
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Nothing in this cell back-propagates, so run the image encoder and the
    # generation without autograd bookkeeping.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # An empty generation yields "" (scored as wrong) instead of raising IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # Assumes the option letter is the third character of the `answer` field
    # (e.g. "['A']") - TODO confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = total_predictions = 0

# Report each noise type in turn, folding its counts into the running totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct, total_predictions = total_correct + correct, total_predictions + total

# Summarize over all noise types, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Saturation-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
# NOTE(review): the checkpoint is loaded again every time this cell runs; if an
# earlier cell already loaded the same model, a single shared setup cell would
# save the reload time.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally; a noise type starts at zero the first time it is seen
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Locate the Saturation-corrupted image for this question
    image_path = f"Noisy DARE TEST/Saturation/{row['path']}"
    try:
        # Opened only to confirm the file exists; the context manager closes the
        # handle right away (load_pil_images below is given the path, not this handle).
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Nothing in this cell back-propagates, so run the image encoder and the
    # generation without autograd bookkeeping.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # An empty generation yields "" (scored as wrong) instead of raising IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # Assumes the option letter is the third character of the `answer` field
    # (e.g. "['A']") - TODO confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = total_predictions = 0

# Report each noise type in turn, folding its counts into the running totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct, total_predictions = total_correct + correct, total_predictions + total

# Summarize over all noise types, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Shot-noise-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
# NOTE(review): the checkpoint is loaded again every time this cell runs; if an
# earlier cell already loaded the same model, a single shared setup cell would
# save the reload time.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally; a noise type starts at zero the first time it is seen
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Locate the Shot-noise-corrupted image for this question
    image_path = f"Noisy DARE TEST/Shot-noise/{row['path']}"
    try:
        # Opened only to confirm the file exists; the context manager closes the
        # handle right away (load_pil_images below is given the path, not this handle).
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Nothing in this cell back-propagates, so run the image encoder and the
    # generation without autograd bookkeeping.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # An empty generation yields "" (scored as wrong) instead of raising IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # Assumes the option letter is the third character of the `answer` field
    # (e.g. "['A']") - TODO confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = total_predictions = 0

# Report each noise type in turn, folding its counts into the running totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct, total_predictions = total_correct + correct, total_predictions + total

# Summarize over all noise types, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Snow-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
# NOTE(review): the checkpoint is loaded again every time this cell runs; if an
# earlier cell already loaded the same model, a single shared setup cell would
# save the reload time.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally; a noise type starts at zero the first time it is seen
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Locate the Snow-corrupted image for this question
    image_path = f"Noisy DARE TEST/Snow/{row['path']}"
    try:
        # Opened only to confirm the file exists; the context manager closes the
        # handle right away (load_pil_images below is given the path, not this handle).
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Nothing in this cell back-propagates, so run the image encoder and the
    # generation without autograd bookkeeping.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # An empty generation yields "" (scored as wrong) instead of raising IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # Assumes the option letter is the third character of the `answer` field
    # (e.g. "['A']") - TODO confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = total_predictions = 0

# Report each noise type in turn, folding its counts into the running totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct, total_predictions = total_correct + correct, total_predictions + total

# Summarize over all noise types, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Spatter-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
# NOTE(review): the checkpoint is loaded again every time this cell runs; if an
# earlier cell already loaded the same model, a single shared setup cell would
# save the reload time.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally; a noise type starts at zero the first time it is seen
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Locate the Spatter-corrupted image for this question
    image_path = f"Noisy DARE TEST/Spatter/{row['path']}"
    try:
        # Opened only to confirm the file exists; the context manager closes the
        # handle right away (load_pil_images below is given the path, not this handle).
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Nothing in this cell back-propagates, so run the image encoder and the
    # generation without autograd bookkeeping.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # An empty generation yields "" (scored as wrong) instead of raising IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # Assumes the option letter is the third character of the `answer` field
    # (e.g. "['A']") - TODO confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = total_predictions = 0

# Report each noise type in turn, folding its counts into the running totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct, total_predictions = total_correct + correct, total_predictions + total

# Summarize over all noise types, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Speckle-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
# NOTE(review): the checkpoint is loaded again every time this cell runs; if an
# earlier cell already loaded the same model, a single shared setup cell would
# save the reload time.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally; a noise type starts at zero the first time it is seen
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Locate the Speckle-noise-corrupted image for this question
    image_path = f"Noisy DARE TEST/Speckle-noise/{row['path']}"
    try:
        # Opened only to confirm the file exists; the context manager closes the
        # handle right away (load_pil_images below is given the path, not this handle).
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Nothing in this cell back-propagates, so run the image encoder and the
    # generation without autograd bookkeeping.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # An empty generation yields "" (scored as wrong) instead of raising IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # Assumes the option letter is the third character of the `answer` field
    # (e.g. "['A']") - TODO confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = total_predictions = 0

# Report each noise type in turn, folding its counts into the running totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct, total_predictions = total_correct + correct, total_predictions + total

# Summarize over all noise types, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Zoom-blur-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# specify the path to the model
# NOTE(review): the checkpoint is loaded again every time this cell runs; if an
# earlier cell already loaded the same model, a single shared setup cell would
# save the reload time.
model_path = "deepseek-ai/Janus-Pro-7B"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Load the dataset
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tally; a noise type starts at zero the first time it is seen
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Evaluate only the rows whose category is "vcr"
for index, row in data[data["category"] == "vcr"].iterrows():
    # Locate the Zoom-Blur-corrupted image for this question
    image_path = f"Noisy DARE TEST/Zoom-Blur/{row['path']}"
    try:
        # Opened only to confirm the file exists; the context manager closes the
        # handle right away (load_pil_images below is given the path, not this handle).
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Nothing in this cell back-propagates, so run the image encoder and the
    # generation without autograd bookkeeping.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # An empty generation yields "" (scored as wrong) instead of raising IndexError.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # Assumes the option letter is the third character of the `answer` field
    # (e.g. "['A']") - TODO confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Running totals across every noise type
total_correct = total_predictions = 0

# Report each noise type in turn, folding its counts into the running totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no predictions is reported as 0% instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct, total_predictions = total_correct + correct, total_predictions + total

# Summarize over all noise types, again guarding the empty case
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")

# Motion-blur-VCR

In [None]:
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import pandas as pd
from collections import defaultdict
from PIL import Image

# Hugging Face identifier of the checkpoint under evaluation
model_path = "deepseek-ai/Janus-Pro-7B"

# The chat processor is loaded from the same checkpoint; its tokenizer is
# reused later to decode the generated ids
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

# Load the model first, then cast / move / switch to eval mode in separate
# steps. Rebinding `vl_gpt` before the CUDA move matters in a notebook: it
# releases any model a previous cell left bound to this name.
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16)
vl_gpt = vl_gpt.cuda()
vl_gpt = vl_gpt.eval()

# Question/answer table driving the evaluation
data = pd.read_csv("Noisy-Denoised_QuestionPairs[new].csv")

# Per-noise-type tallies; unseen noise types start at zero on first access
noise_type_accuracy = defaultdict(lambda: dict(correct=0, total=0))

# Evaluate every row whose category is "vcr" against its motion-blurred image,
# tallying correct/total predictions per noise type in `noise_type_accuracy`.
# NOTE(review): the section heading above this cell says "Count", while the
# filter and the prompt below both use "vcr" — confirm the intended category.
for index, row in data.iterrows():
    if row["category"] != "vcr":
        continue  # Skip non-vcr categories

    # Probe that the image file exists. The handle is closed immediately:
    # load_pil_images() below receives the path through the conversation,
    # so keeping this object open would only leak a file descriptor per row.
    image_path = f"Noisy DARE TEST/Motion-blur/{row['path']}"
    try:
        with Image.open(image_path):
            pass
    except FileNotFoundError:
        print(f"Image not found: {image_path}. Skipping...")
        continue

    # Prepare the conversation: one user turn (image + multiple-choice prompt)
    # followed by an empty assistant turn for the model to complete
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>\nThe following are multiple choice questions about vcr. You should directly answer the question by choosing the correct option given the image and the question. Give only the letter indicating the correct answer e.g. 'A'\n"
            f"Question: {row['denoised_question']}\n"
            "Options:\n"
            f"A. {row['A']}\n"
            f"B. {row['B']}\n"
            f"C. {row['C']}\n"
            f"D. {row['D']}\n"
            "Answer:",
            "images": [image_path],
        },
        {"role": "Assistant", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Pure inference: disable autograd so no graph is built for the
    # embedding pass on every row
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    # Take the last whitespace-separated word (e.g., "A", "B", "C", or "D").
    # An empty generation yields "" and is scored as incorrect instead of
    # raising IndexError and aborting the whole run.
    answer_words = answer.split()
    predicted_answer = answer_words[-1] if answer_words else ""

    # Get the noise type
    noise_type = row["modified_question_function_name"]

    # Compare with the actual answer and update counters.
    # Presumably the answer column is stored like "['A']", so index 2 is the
    # option letter — TODO confirm against the CSV.
    if predicted_answer == row["answer"][2]:
        noise_type_accuracy[noise_type]["correct"] += 1
    noise_type_accuracy[noise_type]["total"] += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']} - Noise Type: {noise_type}")


In [None]:
# Aggregate totals over all noise types (filled in by the loop below)
total_correct, total_predictions = 0, 0

# Report each noise type and fold its counts into the aggregate totals
for noise_type, counts in noise_type_accuracy.items():
    correct, total = counts["correct"], counts["total"]
    print(f"Noise Type : {noise_type}, Correct Predictions: {correct}, Total: {total}")

    # A noise type with no scored rows reports 0 instead of dividing by zero
    if total > 0:
        accuracy = (correct / total) * 100
    else:
        accuracy = 0
    print(f"Accuracy for noise type '{noise_type}': {accuracy:.2f}%")

    total_correct += correct
    total_predictions += total

# Overall accuracy across every noise type (0 when nothing was scored)
print(f"Total Correct: {total_correct}, Total Predictions: {total_predictions}")
if total_predictions > 0:
    overall_accuracy = (total_correct / total_predictions) * 100
else:
    overall_accuracy = 0
print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")