# Count

In [None]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from collections import defaultdict
import pandas as pd
import re

# Instantiate the Idefics2 processor and model from one shared checkpoint id.
checkpoint = "HuggingFaceM4/idefics2-8b"

# Image splitting is switched off on the processor side.
processor = AutoProcessor.from_pretrained(checkpoint, do_image_splitting=False)

# Load the weights in half precision, then move the whole model to the first GPU.
model = AutoModelForVision2Seq.from_pretrained(
    checkpoint,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)
model = model.to("cuda:0")

# Evaluate the model on every question of one category and report accuracy.
#
# Reads the question CSV, loads each question's image from disk, asks the
# model a four-option multiple choice question, extracts the option letter
# from the decoded output and compares it with the ground-truth letter.

# Load the dataset
data = pd.read_csv("[without images]1_correct_validation.csv")

# Category evaluated by this cell.
target_category = "count"

# Initialize counters for accuracy per noise type.
# NOTE(review): nothing in this cell reads or updates this mapping; it is kept
# in case another cell relies on it.
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Running tallies: questions answered correctly / questions actually evaluated.
correct_cnt = 0
total_questions = 0

# Compiled once instead of on every iteration. The pattern anchors on the
# "Assistant:" marker and captures the option letter that follows it.
answer_pattern = re.compile(r"Assistant:\s*([A-D])")

# Select the rows of the target category up front instead of skipping inside
# the loop.
category_rows = data[data["category"] == target_category]

for index, row in category_rows.iterrows():
    # Load the corresponding image
    image_path = f"1_correct_validation_images/{row['path']}"

    try:
        image = Image.open(image_path)
        # Image.open is lazy; force the pixel data to be read here so that a
        # corrupt or truncated file is reported and skipped instead of
        # crashing later inside the processor.
        image.load()
    except FileNotFoundError:
        print(f"Image not found for Question ID {row['id']} at path: {image_path}. Skipping...")
        continue
    except Exception as e:
        print(f"Error loading image for Question ID {row['id']} at path: {image_path}. Error: {e}. Skipping...")
        continue

    # Count the question only once its image is available, so skipped rows
    # do not end up in the accuracy denominator.
    total_questions += 1

    # Prepare the prompt template
    prompt_text = (
        f"The following are multiple choice questions about {row['category']}. "
        "You should directly answer the question by choosing the correct option given the image and the question. "
        "Give only the letter indicating the correct answer e.g. 'A'\n"
        f"Question: {row['question']}\n"
        "Options:\n"
        f"A. {row['A']}\n"
        f"B. {row['B']}\n"
        f"C. {row['C']}\n"
        f"D. {row['D']}\n"
        "Answer:"
    )

    # Define conversation for the processor: one user turn holding the image
    # placeholder followed by the question text.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text},
            ],
        },
    ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Prepare inputs on whatever device the model lives on.
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

    # Generate model output
    output = model.generate(**inputs, max_new_tokens=1000)
    full_answer = processor.batch_decode(output, skip_special_tokens=True)[0]

    # Extract the answer letter from the full output
    if match := answer_pattern.search(full_answer):
        predicted_answer = match.group(1)  # One of A, B, C or D
    else:
        print(f"Could not extract answer for Question ID {row['id']}: {full_answer}")
        predicted_answer = "Unknown"

    # Both sides are wrapped in one-element lists; the prediction is printed
    # in that form below.
    predicted_answer = [predicted_answer]

    # Third character of the answer cell.
    # NOTE(review): presumably the option letter inside a wrapper such as
    # "['A']" -- confirm against the CSV.
    actual_answer = [row["answer"][2]]

    # Compare with the actual answer and update counters
    if predicted_answer == actual_answer:
        correct_cnt += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']}")

# Calculate Accuracy
print(f"Total Correct:  {correct_cnt}")
print(f"Total Questions:  {total_questions}")
if total_questions:
    print(f"Accuracy: {(correct_cnt/total_questions) * 100} % ")
else:
    # Avoid a ZeroDivisionError when the category is absent or every image
    # was skipped.
    print(f"No '{target_category}' questions were evaluated; accuracy is undefined.")

In [None]:
# Evaluate the model on every question of one category and report accuracy.
#
# Reads the question CSV, loads each question's image from disk, asks the
# model a four-option multiple choice question, extracts the option letter
# from the decoded output and compares it with the ground-truth letter.

# Load the dataset
data = pd.read_csv("[without images]1_correct_validation.csv")

# Category evaluated by this cell.
target_category = "order"

# Initialize counters for accuracy per noise type.
# NOTE(review): nothing in this cell reads or updates this mapping; it is kept
# in case another cell relies on it.
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Running tallies: questions answered correctly / questions actually evaluated.
correct_cnt = 0
total_questions = 0

# Compiled once instead of on every iteration. The pattern anchors on the
# "Assistant:" marker and captures the option letter that follows it.
answer_pattern = re.compile(r"Assistant:\s*([A-D])")

# Select the rows of the target category up front instead of skipping inside
# the loop.
category_rows = data[data["category"] == target_category]

for index, row in category_rows.iterrows():
    # Load the corresponding image
    image_path = f"1_correct_validation_images/{row['path']}"

    try:
        image = Image.open(image_path)
        # Image.open is lazy; force the pixel data to be read here so that a
        # corrupt or truncated file is reported and skipped instead of
        # crashing later inside the processor.
        image.load()
    except FileNotFoundError:
        print(f"Image not found for Question ID {row['id']} at path: {image_path}. Skipping...")
        continue
    except Exception as e:
        print(f"Error loading image for Question ID {row['id']} at path: {image_path}. Error: {e}. Skipping...")
        continue

    # Count the question only once its image is available, so skipped rows
    # do not end up in the accuracy denominator.
    total_questions += 1

    # Prepare the prompt template
    prompt_text = (
        f"The following are multiple choice questions about {row['category']}. "
        "You should directly answer the question by choosing the correct option given the image and the question. "
        "Give only the letter indicating the correct answer e.g. 'A'\n"
        f"Question: {row['question']}\n"
        "Options:\n"
        f"A. {row['A']}\n"
        f"B. {row['B']}\n"
        f"C. {row['C']}\n"
        f"D. {row['D']}\n"
        "Answer:"
    )

    # Define conversation for the processor: one user turn holding the image
    # placeholder followed by the question text.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text},
            ],
        },
    ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Prepare inputs on whatever device the model lives on.
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

    # Generate model output
    output = model.generate(**inputs, max_new_tokens=1000)
    full_answer = processor.batch_decode(output, skip_special_tokens=True)[0]

    # Extract the answer letter from the full output
    if match := answer_pattern.search(full_answer):
        predicted_answer = match.group(1)  # One of A, B, C or D
    else:
        print(f"Could not extract answer for Question ID {row['id']}: {full_answer}")
        predicted_answer = "Unknown"

    # Both sides are wrapped in one-element lists; the prediction is printed
    # in that form below.
    predicted_answer = [predicted_answer]

    # Third character of the answer cell.
    # NOTE(review): presumably the option letter inside a wrapper such as
    # "['A']" -- confirm against the CSV.
    actual_answer = [row["answer"][2]]

    # Compare with the actual answer and update counters
    if predicted_answer == actual_answer:
        correct_cnt += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']}")

# Calculate Accuracy
print(f"Total Correct:  {correct_cnt}")
print(f"Total Questions:  {total_questions}")
if total_questions:
    print(f"Accuracy: {(correct_cnt/total_questions) * 100} % ")
else:
    # Avoid a ZeroDivisionError when the category is absent or every image
    # was skipped.
    print(f"No '{target_category}' questions were evaluated; accuracy is undefined.")

In [None]:


# Evaluate the model on every question of one category and report accuracy.
#
# Reads the question CSV, loads each question's image from disk, asks the
# model a four-option multiple choice question, extracts the option letter
# from the decoded output and compares it with the ground-truth letter.

# Load the dataset
data = pd.read_csv("[without images]1_correct_validation.csv")

# Category evaluated by this cell.
target_category = "trick"

# Initialize counters for accuracy per noise type.
# NOTE(review): nothing in this cell reads or updates this mapping; it is kept
# in case another cell relies on it.
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Running tallies: questions answered correctly / questions actually evaluated.
correct_cnt = 0
total_questions = 0

# Compiled once instead of on every iteration. The pattern anchors on the
# "Assistant:" marker and captures the option letter that follows it.
answer_pattern = re.compile(r"Assistant:\s*([A-D])")

# Select the rows of the target category up front instead of skipping inside
# the loop.
category_rows = data[data["category"] == target_category]

for index, row in category_rows.iterrows():
    # Load the corresponding image
    image_path = f"1_correct_validation_images/{row['path']}"

    try:
        image = Image.open(image_path)
        # Image.open is lazy; force the pixel data to be read here so that a
        # corrupt or truncated file is reported and skipped instead of
        # crashing later inside the processor.
        image.load()
    except FileNotFoundError:
        print(f"Image not found for Question ID {row['id']} at path: {image_path}. Skipping...")
        continue
    except Exception as e:
        print(f"Error loading image for Question ID {row['id']} at path: {image_path}. Error: {e}. Skipping...")
        continue

    # Count the question only once its image is available, so skipped rows
    # do not end up in the accuracy denominator.
    total_questions += 1

    # Prepare the prompt template
    prompt_text = (
        f"The following are multiple choice questions about {row['category']}. "
        "You should directly answer the question by choosing the correct option given the image and the question. "
        "Give only the letter indicating the correct answer e.g. 'A'\n"
        f"Question: {row['question']}\n"
        "Options:\n"
        f"A. {row['A']}\n"
        f"B. {row['B']}\n"
        f"C. {row['C']}\n"
        f"D. {row['D']}\n"
        "Answer:"
    )

    # Define conversation for the processor: one user turn holding the image
    # placeholder followed by the question text.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text},
            ],
        },
    ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Prepare inputs on whatever device the model lives on.
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

    # Generate model output
    output = model.generate(**inputs, max_new_tokens=1000)
    full_answer = processor.batch_decode(output, skip_special_tokens=True)[0]

    # Extract the answer letter from the full output
    if match := answer_pattern.search(full_answer):
        predicted_answer = match.group(1)  # One of A, B, C or D
    else:
        print(f"Could not extract answer for Question ID {row['id']}: {full_answer}")
        predicted_answer = "Unknown"

    # Both sides are wrapped in one-element lists; the prediction is printed
    # in that form below.
    predicted_answer = [predicted_answer]

    # Third character of the answer cell.
    # NOTE(review): presumably the option letter inside a wrapper such as
    # "['A']" -- confirm against the CSV.
    actual_answer = [row["answer"][2]]

    # Compare with the actual answer and update counters
    if predicted_answer == actual_answer:
        correct_cnt += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']}")

# Calculate Accuracy
print(f"Total Correct:  {correct_cnt}")
print(f"Total Questions:  {total_questions}")
if total_questions:
    print(f"Accuracy: {(correct_cnt/total_questions) * 100} % ")
else:
    # Avoid a ZeroDivisionError when the category is absent or every image
    # was skipped.
    print(f"No '{target_category}' questions were evaluated; accuracy is undefined.")

In [None]:


# Evaluate the model on every question of one category and report accuracy.
#
# Reads the question CSV, loads each question's image from disk, asks the
# model a four-option multiple choice question, extracts the option letter
# from the decoded output and compares it with the ground-truth letter.

# Load the dataset
data = pd.read_csv("[without images]1_correct_validation.csv")

# Category evaluated by this cell.
target_category = "vcr"

# Initialize counters for accuracy per noise type.
# NOTE(review): nothing in this cell reads or updates this mapping; it is kept
# in case another cell relies on it.
noise_type_accuracy = defaultdict(lambda: {"correct": 0, "total": 0})

# Running tallies: questions answered correctly / questions actually evaluated.
correct_cnt = 0
total_questions = 0

# Compiled once instead of on every iteration. The pattern anchors on the
# "Assistant:" marker and captures the option letter that follows it.
answer_pattern = re.compile(r"Assistant:\s*([A-D])")

# Select the rows of the target category up front instead of skipping inside
# the loop.
category_rows = data[data["category"] == target_category]

for index, row in category_rows.iterrows():
    # Load the corresponding image
    image_path = f"1_correct_validation_images/{row['path']}"

    try:
        image = Image.open(image_path)
        # Image.open is lazy; force the pixel data to be read here so that a
        # corrupt or truncated file is reported and skipped instead of
        # crashing later inside the processor.
        image.load()
    except FileNotFoundError:
        print(f"Image not found for Question ID {row['id']} at path: {image_path}. Skipping...")
        continue
    except Exception as e:
        print(f"Error loading image for Question ID {row['id']} at path: {image_path}. Error: {e}. Skipping...")
        continue

    # Count the question only once its image is available, so skipped rows
    # do not end up in the accuracy denominator.
    total_questions += 1

    # Prepare the prompt template
    prompt_text = (
        f"The following are multiple choice questions about {row['category']}. "
        "You should directly answer the question by choosing the correct option given the image and the question. "
        "Give only the letter indicating the correct answer e.g. 'A'\n"
        f"Question: {row['question']}\n"
        "Options:\n"
        f"A. {row['A']}\n"
        f"B. {row['B']}\n"
        f"C. {row['C']}\n"
        f"D. {row['D']}\n"
        "Answer:"
    )

    # Define conversation for the processor: one user turn holding the image
    # placeholder followed by the question text.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text},
            ],
        },
    ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Prepare inputs on whatever device the model lives on.
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

    # Generate model output
    output = model.generate(**inputs, max_new_tokens=1000)
    full_answer = processor.batch_decode(output, skip_special_tokens=True)[0]

    # Extract the answer letter from the full output
    if match := answer_pattern.search(full_answer):
        predicted_answer = match.group(1)  # One of A, B, C or D
    else:
        print(f"Could not extract answer for Question ID {row['id']}: {full_answer}")
        predicted_answer = "Unknown"

    # Both sides are wrapped in one-element lists; the prediction is printed
    # in that form below.
    predicted_answer = [predicted_answer]

    # Third character of the answer cell.
    # NOTE(review): presumably the option letter inside a wrapper such as
    # "['A']" -- confirm against the CSV.
    actual_answer = [row["answer"][2]]

    # Compare with the actual answer and update counters
    if predicted_answer == actual_answer:
        correct_cnt += 1

    # Print the prediction
    print(f"Question ID {row['id']} - Predicted Answer: {predicted_answer} - Actual Answer: {row['answer']}")

# Calculate Accuracy
print(f"Total Correct:  {correct_cnt}")
print(f"Total Questions:  {total_questions}")
if total_questions:
    print(f"Accuracy: {(correct_cnt/total_questions) * 100} % ")
else:
    # Avoid a ZeroDivisionError when the category is absent or every image
    # was skipped.
    print(f"No '{target_category}' questions were evaluated; accuracy is undefined.")