In [6]:
import pandas as pd
import anthropic
import os
import sys
sys.path.append('..')
from config import ANTHROPIC_API_KEY 
import openpyxl
import requests
import io 
from io import BytesIO
import base64
import sys
from PIL import Image

# Initialize Claude client
if not ANTHROPIC_API_KEY:
    raise ValueError("Please set ANTHROPIC_API_KEY environment variable")
client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

# System prompt matching the larger script
SYSTEM_PROMPT = """You must reply with NO explanations, NO headers, NO extra text.
Language: ENGLISH. Keep the output strictly in the required format.

You will receive ONE medical question with:
- Text fields: QuestionID, Question, options A..E (some may be null).
- Optionally ONE image immediately after the text (when provided).

Task: choose EXACTLY ONE correct option among A, B, C, D, E for the QuestionID.
You may consider the image when present.

STRICT output format:
QuestionID, LETTER

Example:
ES3341, B

RULES:
- Output EXACTLY ONE line in the exact format above.
- Do NOT repeat the instructions.
- Do NOT include the option text, ONLY the letter."""

# New system prompt that asks for reasoning
SYSTEM_PROMPT_WITH_REASONING = """You will receive ONE medical question with:
- Text fields: QuestionID, Question, options A..E (some may be null).
- Optionally ONE image immediately after the text (when provided).

Task: 
1. Choose EXACTLY ONE correct option among A, B, C, D, E for the QuestionID.
2. Provide your reasoning for selecting this answer.

Output format:
QuestionID, LETTER
Reasoning: [Your detailed reasoning here]

Example:
ES3341, B
Reasoning: The image shows characteristic signs of..."""

USER_LEAD = "Below is a single record. Use ONLY the relevant information."

def build_content_like_main_script(question_id: str, question_text: str, options: dict) -> list:
    """Build content in the same format as the main script"""
    content = []
    
    # Start with the lead text
    lines = [USER_LEAD, f"QuestionID: {question_id}"]
    
    # Add question text
    if question_text:
        lines.append(f"Question: {question_text}")
    
    # Add options A through E
    for label in ["A", "B", "C", "D", "E"]:
        option_text = options.get(label, "")
        lines.append(f"{label}) {option_text if option_text else ''}")
    
    # Create text content block
    content.append({
        "type": "text",
        "text": "\n".join(lines)
    })
    
    return content

def call_claude_with_image(content, use_reasoning=False):
    """Call Claude API and return parsed response"""
    try:
        response = client.messages.create(
            model="claude-sonnet-4-5-20250929",
            max_tokens=500 if use_reasoning else 10,
            system=SYSTEM_PROMPT_WITH_REASONING if use_reasoning else SYSTEM_PROMPT,
            messages=[{
                "role": "user",
                "content": content
            }]
        )
        
        # Extract Claude's answer
        text = ""
        for block in response.content:
            if block.type == "text":
                text += block.text
        text = text.strip()
        
        return text
        
    except Exception as e:
        print(f"Error calling Claude API: {e}")
        return None

def parse_claude_response(text):
    """Parse Claude's response to extract answer letter"""
    import re
    LINE_RX = re.compile(r'^\s*([^,]+)\s*,\s*([A-Ea-e])\s*$', re.UNICODE)
    
    if text:
        first_line = text.splitlines()[0] if text else ""
        m = LINE_RX.match(first_line)
        if m:
            qid_out = m.group(1).strip()
            letter = m.group(2).upper().strip()
            return letter
        else:
            return "PARSE_ERROR"
    else:
        return "NO_RESPONSE"

# Load your Excel data
df = pd.read_excel("../data/subset_with_images.xlsx", sheet_name="SSM_Q_ITA")
picture_link_col = df.columns.get_loc('picture_link') + 1
workbook = openpyxl.load_workbook("../data/subset_with_images.xlsx")
worksheet = workbook["SSM_Q_ITA"]

# Path to your fake image
FAKE_IMAGE_PATH = "../data/Fake_Image_path/image.png"  # UPDATE THIS PATH

# Loop through questions
results = []
skipped_questions = []

for index, row in df.iterrows():
    question_id = row['questionID']
    question_text = row['question_text']
    correct_answer = row['correct_option']
    excel_row = index + 2
    
    # Check if picture_link column exists and get its value
    picture_link = row.get('picture_link', 'N/A')
    has_image = False
    real_image = None
    
    # Try to load real image if available
    if picture_link != 'N/A' and pd.notna(picture_link):
        cell = worksheet.cell(row=excel_row, column=picture_link_col)
        if cell.hyperlink and cell.hyperlink.target:
            try:
                url = cell.hyperlink.target
                file_id = url.split("/d/")[1].split("/")[0]
                download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
                response = requests.get(download_url, timeout=30)
                img = Image.open(BytesIO(response.content))
                
                # Convert RGBA to RGB
                if img.mode in ('RGBA', 'LA'):
                    background = Image.new('RGB', img.size, (255, 255, 255))
                    background.paste(img, mask=img.split()[-1])
                    img = background
                
                real_image = img
                has_image = True
                print(f"Successfully loaded image for question {question_id}")
            except Exception as e:
                print(f"Failed to load image for question {question_id}: {e}")
    
    # Skip questions with invalid correct_option
    if pd.isna(correct_answer) or correct_answer is None:
        print(f"Skipping {question_id}: correct_option is NaN/None")
        skipped_questions.append({
            "question_id": question_id,
            "reason": "correct_option is NaN/None"
        })
        continue

    # Convert to string and handle blanks
    correct_answer_str = str(correct_answer).strip().upper()

    if not correct_answer_str or correct_answer_str not in ['A', 'B', 'C', 'D', 'E']:
        print(f"Skipping {question_id}: Invalid correct_option '{correct_answer}'")
        skipped_questions.append({
            "question_id": question_id,
            "reason": f"Invalid correct_option: {correct_answer}"
        })
        continue
    
    # Build options dictionary
    options = {
        "A": row['option_a'],
        "B": row['option_b'], 
        "C": row['option_c'],
        "D": row['option_d'],
        "E": row['option_e']
    }
    
    # Initialize result dictionary
    result = {
        "question_id": question_id,
        "question": question_text,
        "correct_answer": correct_answer.upper(),
        "has_image": has_image
    }
    
    # If question has image, process with both real and fake images
    if has_image and real_image is not None:
        print(f"\n{'='*60}")
        print(f"Processing question with IMAGE: {question_id}")
        print(f"{'='*60}")
        
        # Build base content
        content_base = build_content_like_main_script(question_id, question_text, options)
        
        # ==== REAL IMAGE ====
        print(f"\n--- Testing with REAL image ---")
        content_real = content_base.copy()
        
        # Add real image
        buffer = BytesIO()
        real_image.save(buffer, format='JPEG')
        image_data_real = base64.b64encode(buffer.getvalue()).decode('utf-8')
        
        content_real.append({
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": image_data_real
            }
        })
        
        # Call Claude with real image and reasoning
        response_real = call_claude_with_image(content_real, use_reasoning=True)
        
        if response_real:
            print(f"Claude's response with REAL image:\n{response_real}")
            claude_answer_real = parse_claude_response(response_real)
            result["claude_answer_real"] = claude_answer_real
            result["claude_response_real"] = response_real
            result["is_correct_real"] = (correct_answer.upper() == claude_answer_real)
        else:
            result["claude_answer_real"] = "API_ERROR"
            result["claude_response_real"] = "API_ERROR"
            result["is_correct_real"] = False
        
        # ==== FAKE IMAGE ====
        print(f"\n--- Testing with FAKE image ---")
        content_fake = content_base.copy()
        
        # Load and add fake image
        try:
            fake_img = Image.open(FAKE_IMAGE_PATH)
            if fake_img.mode in ('RGBA', 'LA'):
                background = Image.new('RGB', fake_img.size, (255, 255, 255))
                background.paste(fake_img, mask=fake_img.split()[-1])
                fake_img = background
            
            buffer_fake = BytesIO()
            fake_img.save(buffer_fake, format='JPEG')
            image_data_fake = base64.b64encode(buffer_fake.getvalue()).decode('utf-8')
            
            content_fake.append({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": image_data_fake
                }
            })
            
            # Call Claude with fake image and reasoning
            response_fake = call_claude_with_image(content_fake, use_reasoning=True)
            
            if response_fake:
                print(f"Claude's response with FAKE image:\n{response_fake}")
                claude_answer_fake = parse_claude_response(response_fake)
                result["claude_answer_fake"] = claude_answer_fake
                result["claude_response_fake"] = response_fake
                result["is_correct_fake"] = (correct_answer.upper() == claude_answer_fake)
            else:
                result["claude_answer_fake"] = "API_ERROR"
                result["claude_response_fake"] = "API_ERROR"
                result["is_correct_fake"] = False
                
        except Exception as e:
            print(f"Error loading fake image: {e}")
            result["claude_answer_fake"] = "FAKE_IMAGE_ERROR"
            result["claude_response_fake"] = f"Error: {e}"
            result["is_correct_fake"] = False
        
        print(f"\n{'='*60}")
        print(f"Correct answer: {correct_answer.upper()}")
        print(f"Claude with REAL: {result.get('claude_answer_real', 'N/A')} {'✓' if result.get('is_correct_real', False) else '✗'}")
        print(f"Claude with FAKE: {result.get('claude_answer_fake', 'N/A')} {'✓' if result.get('is_correct_fake', False) else '✗'}")
        print(f"{'='*60}\n")
        
    else:
        # No image - process normally (optional: you can skip these)
        print(f"Processing question WITHOUT image: {question_id}")
        content = build_content_like_main_script(question_id, question_text, options)
        
        response_text = call_claude_with_image(content, use_reasoning=False)
        
        if response_text:
            claude_answer = parse_claude_response(response_text)
            result["claude_answer"] = claude_answer
            result["is_correct"] = (correct_answer.upper() == claude_answer)
        else:
            result["claude_answer"] = "API_ERROR"
            result["is_correct"] = False
    
    results.append(result)

# Print summary
total_questions = len(results)
questions_with_images = sum(1 for r in results if r.get("has_image", False))
questions_without_images = total_questions - questions_with_images

print(f"\n{'='*60}")
print(f"=== SUMMARY ===")
print(f"{'='*60}")
print(f"Total questions processed: {total_questions}")
print(f"Questions with images: {questions_with_images}")
print(f"Questions without images: {questions_without_images}")
print(f"Questions skipped: {len(skipped_questions)}")

if questions_with_images > 0:
    correct_real = sum(1 for r in results if r.get("is_correct_real", False))
    correct_fake = sum(1 for r in results if r.get("is_correct_fake", False))
    
    accuracy_real = correct_real / questions_with_images * 100
    accuracy_fake = correct_fake / questions_with_images * 100
    
    print(f"\n--- Results with REAL images ---")
    print(f"Correct answers: {correct_real}/{questions_with_images}")
    print(f"Accuracy: {accuracy_real:.1f}%")
    
    print(f"\n--- Results with FAKE images ---")
    print(f"Correct answers: {correct_fake}/{questions_with_images}")
    print(f"Accuracy: {accuracy_fake:.1f}%")

if questions_without_images > 0:
    correct_no_image = sum(1 for r in results if not r.get("has_image", False) and r.get("is_correct", False))
    accuracy_no_image = correct_no_image / questions_without_images * 100
    print(f"\n--- Results WITHOUT images ---")
    print(f"Correct answers: {correct_no_image}/{questions_without_images}")
    print(f"Accuracy: {accuracy_no_image:.1f}%")

# Print skipped questions summary
if skipped_questions:
    print(f"\n=== SKIPPED QUESTIONS ===")
    for skip in skipped_questions:
        print(f"{skip['question_id']}: {skip['reason']}")

# Save results to file
results_df = pd.DataFrame(results)
results_df.to_csv("revision_11_25/claude_results_real_vs_fake_images_11_25_1.csv", index=False)
print(f"\nResults saved to claude_results_real_vs_fake_images_11_25_1.csv")

# Also save skipped questions for reference
if skipped_questions:
    skipped_df = pd.DataFrame(skipped_questions)
    skipped_df.to_csv("revision_11_25/skipped_questions_11_25_1.csv", index=False)
    print(f"Skipped questions saved to skipped_questions_original.csv")

Successfully loaded image for question IT0006

Processing question with IMAGE: IT0006

--- Testing with REAL image ---
Claude's response with REAL image:
IT0006, C

Reasoning: This ECG shows characteristic features of an anterior wall myocardial infarction:

1. **ST-segment elevation in precordial leads**: There is clear ST-segment elevation visible in the anterior precordial leads (V1-V6), which is the hallmark of an acute ST-elevation myocardial infarction (STEMI).

2. **Lead distribution**: The ST elevation is predominantly seen in leads V1 through V6, which correspond to the anterior wall of the left ventricle. The precordial leads overlying the anterior chest wall reflect electrical activity from the anterior myocardium.

3. **Absence of inferior changes**: Leads II, III, and aVF (inferior leads) do not show the ST elevations that would be expected in an inferior wall MI (option D).

4. **Regular rhythm with P waves**: The rhythm appears regular with visible P waves preceding QRS 

OSError: Cannot save file into a non-existent directory: 'revision_11_25'

In [8]:
results_df.to_csv("results/claude_results_real_vs_fake_images_11_25_1.csv", index=False)


In [4]:
import os

os.getcwd()


'c:\\Users\\zilef\\OneDrive\\Documents\\eurips2025-mmrl4h-italian-medvqa-visual-grounding\\revision_11_25'