In [1]:
import os
import json
import base64
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from datetime import datetime
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

# Local imports
from predict import predict_image
from prompt_templates import prompt_templates, comparison_prompt

print("✅ Aufgabe 4 Final Evaluation System")
print("🎯 Ready to test complete workflow!")

# Setup paths.
# VAL_DIR points at a machine-specific location, so it can be overridden with
# the AUFGABE4_VAL_DIR environment variable; the original path stays the default.
VAL_DIR = os.environ.get(
    "AUFGABE4_VAL_DIR",
    r"C:\Users\egese\Desktop\dataset\val\SAP",  # Student solutions (only for the test)
)
RESULTS_DIR = "results/validation_analysis/"
REFERENCE_DIR = "results/reference_solutions/"
FINAL_OUTPUT = "results/aufgabe4_final/"

# Create output directory
os.makedirs(FINAL_OUTPUT, exist_ok=True)

print(f"📁 Validation images: {VAL_DIR}")
print(f"📊 Analysis results: {RESULTS_DIR}")
print(f"🎯 Reference solutions: {REFERENCE_DIR}")
print(f"🏆 Final output: {FINAL_OUTPUT}")


✅ Aufgabe 4 Final Evaluation System
🎯 Ready to test complete workflow!
📁 Validation images: C:\Users\egese\Desktop\dataset\val\SAP
📊 Analysis results: results/validation_analysis/
🎯 Reference solutions: results/reference_solutions/
🏆 Final output: results/aufgabe4_final/


In [31]:
class Aufgabe4EvaluationSystem:
    def __init__(self, api_key):
        """Complete evaluation system for Aufgabe 4"""
        self.api_key = api_key
        self.setup_api()
        
        # Load reference solutions
        self.reference_solutions = {}
        self.load_reference_database()
        
        # Evaluation results
        self.evaluation_results = []
        
    def setup_api(self):
        """Setup OpenRouter API"""
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        # Use more reliable model for complex multi-image tasks
        self.model = "qwen/qwen2.5-vl-32b-instruct" # Better multi-image support
        # Alternative models if current fails:
        # self.model = "qwen/qwen-vl-max" # Backup model 1
        # self.model = "anthropic/claude-3-sonnet:beta" # Backup model 2
        print(f"✅ API ready for evaluation (Model: {self.model})")
        print("🔧 Using Qwen2.5-VL-32B for improved JSON processing")
    
    def switch_model(self, model_name):
        """Switch to a different model if current one has issues"""
        old_model = self.model
        self.model = model_name
        print(f"🔄 Switched model: {old_model} → {model_name}")
    
    def load_reference_database(self):
        """Load reference solutions from JSON files"""
        # Try multiple possible locations for reference database
        possible_paths = [
            "reference_database_latest.json",
            "reference_database_auto_*.json", 
            os.path.join(REFERENCE_DIR, "reference_solutions_*.json"),
            os.path.join(REFERENCE_DIR, "reference_database_*.json")
        ]
        
        reference_file = None
        
        # Check for latest reference database in current directory first
        if os.path.exists("reference_database_latest.json"):
            reference_file = "reference_database_latest.json"
            print("✅ Found reference_database_latest.json")
        else:
            # Look for auto-generated files
            import glob
            auto_files = glob.glob("reference_database_auto_*.json")
            if auto_files:
                reference_file = sorted(auto_files)[-1]  # Get latest
                print(f"✅ Found auto-generated reference: {reference_file}")
            else:
                # Check reference directory if it exists
                if os.path.exists(REFERENCE_DIR):
                    ref_files = [f for f in os.listdir(REFERENCE_DIR) if f.startswith('reference_')]
                    if ref_files:
                        latest_ref = sorted(ref_files)[-1]
                        reference_file = os.path.join(REFERENCE_DIR, latest_ref)
                        print(f"✅ Found reference in directory: {latest_ref}")
        
        if reference_file and os.path.exists(reference_file):
            try:
                with open(reference_file, 'r', encoding='utf-8') as f:
                    ref_data = json.load(f)
                    
                # Handle different JSON structures
                if 'references' in ref_data:
                    self.reference_solutions = ref_data['references']
                elif 'categories' in ref_data:
                    self.reference_solutions = ref_data['categories']
                else:
                    # Assume the data itself is the reference structure
                    self.reference_solutions = ref_data
                
                print(f"✅ Loaded reference database: {len(self.reference_solutions)} categories")
                for category, refs in self.reference_solutions.items():
                    if isinstance(refs, list):
                        print(f"   📁 {category}: {len(refs)} reference(s)")
                    else:
                        print(f"   📁 {category}: {type(refs)} data")
                        
            except Exception as e:
                print(f"❌ Error loading reference database: {e}")
                self.reference_solutions = {}
        else:
            print("❌ No reference database found!")
            print("💡 Available files:", [f for f in os.listdir('.') if 'reference' in f])
            self.reference_solutions = {}
    
    def classify_student_submission(self, image_path):
        """CNN classification of student submission"""
        try:
            predicted_class, confidence = predict_image(image_path)
            return {
                "category": predicted_class,
                "confidence": float(confidence),
                "status": "success"
            }
        except Exception as e:
            return {
                "category": None,
                "confidence": 0.0,
                "status": "error",
                "error": str(e)
            }
    
    def get_best_reference(self, category):
        """Get best reference for a category"""
        if category not in self.reference_solutions:
            return None
        
        references = self.reference_solutions[category]
        if not references:
            return None
        
        # Get first (best) reference
        best_ref = references[0]
        ref_filename = best_ref['filename']
        
        # Reference images are in mapped_train, organized by category
        # Try different possible paths
        possible_paths = [
            os.path.join(r"C:\Users\egese\Desktop\dataset\mapped_train", category, ref_filename),
            os.path.join(r"C:\Users\egese\Desktop\dataset\mapped_train", category.replace(" ", "-"), ref_filename),
            os.path.join(r"C:\Users\egese\Desktop\dataset\mapped_train", category.replace("-", " "), ref_filename),
            os.path.join(VAL_DIR, ref_filename),  # Fallback to val directory
        ]
        
        for ref_path in possible_paths:
            if os.path.exists(ref_path):
                return {
                    "filename": ref_filename,
                    "path": ref_path,
                    "details": best_ref,
                    "category_path": category
                }
        
        print(f"⚠️ Reference image not found: {ref_filename} for category {category}")
        print(f"   Tried paths: {possible_paths}")
        return None
    
    def get_image_media_type(self, image_path):
        """Detect actual image format and return correct media type"""
        try:
            with Image.open(image_path) as img:
                format_name = img.format.lower()
                if format_name == 'jpeg':
                    return 'image/jpeg'
                elif format_name == 'png':
                    return 'image/png'
                elif format_name == 'gif':
                    return 'image/gif'
                elif format_name == 'webp':
                    return 'image/webp'
                else:
                    return 'image/jpeg'  # Default fallback
        except Exception:
            # Fallback based on file extension
            ext = os.path.splitext(image_path)[1].lower()
            if ext in ['.png']:
                return 'image/png'
            elif ext in ['.gif']:
                return 'image/gif'
            elif ext in ['.webp']:
                return 'image/webp'
            else:
                return 'image/jpeg'  # Default
    
    def encode_image(self, image_path):
        """Encode image to base64 with correct media type"""
        try:
            with open(image_path, "rb") as image_file:
                base64_data = base64.b64encode(image_file.read()).decode('utf-8')
                media_type = self.get_image_media_type(image_path)
                return base64_data, media_type
        except Exception as e:
            print(f"❌ Image encoding error: {e}")
            return None, None
    
    def get_category_prompt(self, category):
        """Look up the prompt template for a category, falling back to the generic one."""
        # Maps CNN prediction labels onto the keys used in prompt_templates
        aliases = {
            "Excel-Tabelle": "Excel-Tabelle",
            "Data-Flow": "Data-Flow",
            "Data-Transfer-Process": "Data-Transfer-Process",
            "Transformation": "Transformation",
            "Data Source": "Data Source",
            "Info-Object": "Info-Object",
            # Alternative spellings, in case they occur
            "Data-Transfer": "Data-Transfer-Process",
            "DataFlow": "Data-Flow",
            "DataSource": "Data Source",
            "InfoObject": "Info-Object",
            "Excel": "Excel-Tabelle"
        }

        # Resolve to the canonical key; unknown labels pass through unchanged
        mapped_category = aliases.get(category, category)

        if mapped_category not in prompt_templates:
            print(f"⚠️ No specific prompt for category '{category}', using generic prompt")
            return self.get_generic_prompt()

        print(f"   📋 Using category-specific prompt for: {mapped_category}")
        return prompt_templates[mapped_category]
    
    def get_generic_prompt(self):
        """Generic prompt if category-specific not found"""
        return """
Analysiere das Bild und bewerte nach folgenden Kriterien:
{
  "struktur_qualitaet": {
    "aufbau_logisch": true/false,
    "elemente_erkennbar": true/false,
    "vollstaendigkeit": 0-10,
    "score": 0-10
  },
  "technische_qualitaet": {
    "lesbarkeit": "gut/mittel/schlecht",
    "detailgrad": "zu wenig/angemessen/zu viel", 
    "fachliche_korrektheit": 0-10,
    "score": 0-10
  },
  "sap_kontext": {
    "sap_bw_relevant": true/false,
    "terminologie_korrekt": true/false,
    "business_kontext": "erkennbar/unklar",
    "score": 0-10
  },
  "gesamt_score": 0-10,
  "verbesserungsvorschlaege": ["Konkrete Hinweise"]
}
""",
    
    def _make_api_call(self, messages, max_tokens=2048):
        """Helper function to make a generic API call."""
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "http://localhost:8888", # Or your app's URL
            "X-Title": "Aufgabe 4 Evaluation"
        }
        data = {
            "model": self.model,
            "max_tokens": max_tokens,
            "temperature": 0.1,
            "messages": messages
        }
        try:
            response = requests.post(self.base_url, headers=headers, json=data, timeout=120)
            if response.status_code == 200:
                result = response.json()
                content = result.get('choices', [{}])[0].get('message', {}).get('content', '')
                return {"response": content}
            else:
                return {"error": f"API Error HTTP {response.status_code}: {response.text}"}
        except Exception as e:
            return {"error": f"Request Exception: {str(e)}"}

    def _check_image_quality(self, image_path):
        """Check if image is suitable for evaluation (Schritt 1)"""
        print(f"   Prüfe Bildqualität: {os.path.basename(image_path)}")
        base64_image, media_type = self.encode_image(image_path)
        if not base64_image:
            return {"error": "Failed to encode image"}

        prompt_text = """SCHRITT 1: PRÜFUNG DER EIGNUNG ZUR BEWERTUNG

Prüfe, ob das Studenten-Bild für eine Bewertung GEEIGNET ist. 
Ein Bild ist **NICHT GEEIGNET** (`"is_evaluable": false`), wenn einer der folgenden Punkte zutrifft:

**Fokus auf UI-Elemente statt Inhalt:**
- Das Bild zeigt hauptsächlich ein Menü (Rechtsklick), Dropdown-Liste, Dialogbox oder Popup-Fehlermeldung
- Das Bild zeigt den Login-Screen, SAP-Startseite (Easy Access) oder generische Transaktionsauswahl

**Qualität und Lesbarkeit:**
- Das Bild ist stark verpixelt, unscharf oder niedrig aufgelöst
- Das Bild ist extrem dunkel, überbelichtet oder hat geringen Kontrast

**Falscher Bildausschnitt:**
- Das Bild ist zu stark herangezoomt (nur winziges Detail sichtbar)
- Das Bild ist zu weit herausgezoomt (SAP-Fenster sehr klein im Screenshot)

**Irrelevanter Inhalt:**
- Das Bild zeigt ABAP-Code statt grafisches Modell
- Das Bild zeigt andere Anwendung (Windows Explorer, etc.)
- Das Bild zeigt leeren/Lade-Bildschirm oder ist unvollständig
- Es gibt irrelevante Inhalte auf dem Bildschirm

Gib NUR folgendes JSON zurück:

{
  "is_evaluable": true/false,
  "reason": "Kurze Begründung warum geeignet/nicht geeignet"
}

WICHTIG: Antworte NUR mit JSON - keine Markdown-Blöcke, keine Erklärung."""
        
        messages = [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{base64_image}"}},
                {"type": "text", "text": prompt_text}
            ]
        }]
        
        quality_result = self._make_api_call(messages)
        return self.parse_evaluation_response(quality_result)

    def _analyze_single_image(self, image_path, category_prompt):
        """Analyzes a single image using the LLM and returns the JSON analysis."""
        print(f"   Analysiere Bild: {os.path.basename(image_path)}")
        base64_image, media_type = self.encode_image(image_path)
        if not base64_image:
            return {"error": "Failed to encode image"}

        prompt_text = f"""Du bist ein SAP BW Experte. Analysiere das Bild detailliert und gib deine Bewertung als JSON zurück.

BEWERTUNGSSCHEMA (exakt in diesem Format antworten):
{category_prompt}

WICHTIGE REGELN:
- Verwende EXAKT die Felder aus dem Schema oben
- Alle score-Werte sind zwischen 0-10
- Alle true/false Werte sind boolean
- Alle Textwerte sind Strings in Anführungszeichen
- Gib NUR das JSON zurück - keine Markdown-Blöcke, keine Erklärung

Antworte NUR mit dem JSON-Objekt:"""
        
        messages = [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{base64_image}"}},
                {"type": "text", "text": prompt_text}
            ]
        }]
        
        analysis_result = self._make_api_call(messages)
        return self.parse_evaluation_response(analysis_result)

    def debug_single_image_passthrough(self, image_path):
        """A simple test to see if the model can see ONE image."""
        print("\n--- 🕵️ RUNNING SINGLE IMAGE DEBUG TEST ---")
        student_b64, student_media_type = self.encode_image(image_path)
        if not student_b64:
            return {"error": "Failed to encode image"}

        messages = [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:{student_media_type};base64,{student_b64}"}},
                {"type": "text", "text": "Describe this image in detail in English. What do you see?"}
            ]
        }]
        
        try:
            print("   ...Sending request to model...")
            result = self._make_api_call(messages, max_tokens=1024)
            
            if "error" not in result:
                print("\n--- ✅ Model Response ---")
                print(result["response"])
                return result
            else:
                print("\n--- ❌ Error Response ---")
                print(result["error"])
                return result
        except Exception as e:
            return {"error": f"Request failed: {str(e)}"}


    
    def compare_with_reference(self, student_path, reference_info, category):
        """Compare student submission with reference using 4-step approach with quality check"""
        
        # STEP 0: Check if student image is evaluable (quality control)
        quality_check = self._check_image_quality(student_path)
        if "error" in quality_check:
            return quality_check
            
        # If image is not suitable for evaluation, return early with detailed reason
        if not quality_check.get("is_evaluable", False):
            return {
                "qualitaetspruefung": {
                    "is_evaluable": False,
                    "reason": quality_check.get('reason', 'Bild nicht für Bewertung geeignet')
                },
                "score": 0,
                "detailed_scores": {
                    "structure": 0,
                    "technical_quality": 0, 
                    "completeness": 0,
                    "correctness": 0
                },
                "strengths": [],
                "weaknesses": ["Bild ist nicht für Bewertung geeignet"],
                "suggestions": ["Bitte ein aussagekräftiges Screenshot der SAP BW Modellierung einreichen"],
                "category_specific_feedback": f"Nicht bewertbar: {quality_check.get('reason', 'Unbekannter Grund')}",
                "evaluation_status": "not_evaluable"
            }
        
        print(f"   ✅ Bildqualität OK: {quality_check.get('reason', '')}")
        
        # STEP 1: Analyze student image
        category_prompt = self.get_category_prompt(category)
        student_analysis = self._analyze_single_image(student_path, category_prompt)
        if "error" in student_analysis:
            return {"error": f"Student image analysis failed: {student_analysis['error']}"}

        # STEP 2: Analyze reference image  
        reference_analysis = self._analyze_single_image(reference_info['path'], category_prompt)
        if "error" in reference_analysis:
            return {"error": f"Reference image analysis failed: {reference_analysis['error']}"}
        
        # STEP 3: Compare the two analyses using SAME category-specific format
        print("   Vergleiche die beiden Analysen...")
        
        # Get the category-specific template for comparison format
        comparison_template = self.get_category_prompt(category)
        
        comparison_prompt_template = """Du bist ein SAP BW Experte. Vergleiche die beiden kategorie-spezifischen Bildanalysen der Kategorie "{category}".

=== STUDENT ANALYSE ===
{student_analysis}

=== REFERENZ ANALYSE (Musterlösung) ===
{reference_analysis}

AUFGABE:
1. Vergleiche die beiden Analysen Punkt für Punkt
2. Bewerte jeden Aspekt basierend auf der Referenz  
3. Gib das Ergebnis im GLEICHEN Format wie die ursprünglichen Analysen zurück

WICHTIG: Verwende EXAKT das gleiche JSON-Schema wie die Einzelanalysen:
{comparison_template}

Aber ersetze die Werte mit BEWERTUNGEN statt Analyseergebnissen:
- Für score-Felder: Gib Punkte 0-10 basierend auf Vergleich mit Referenz
- Für boolean-Felder: true wenn Student gut abschneidet, false wenn schlecht
- Für Text-Felder: Bewertungskommentare statt Analyseergebnisse
- Für Array-Felder: Konkrete Verbesserungsvorschläge

Zusätzlich füge am Ende hinzu:
"bewertungs_zusammenfassung": {{
  "gesamtpunktzahl": <0-100>,
  "note": <1.0-5.0>,
  "bestanden": true/false,
  "hauptkritikpunkte": ["Punkt 1", "Punkt 2"],
  "verbesserungsempfehlungen": ["Empfehlung 1", "Empfehlung 2"]
}}

Antworte NUR mit dem JSON-Objekt (keine Markdown-Blöcke):"""

        comparison_prompt = comparison_prompt_template.format(
            student_analysis=json.dumps(student_analysis, indent=2),
            reference_analysis=json.dumps(reference_analysis, indent=2),
            category=category,
            comparison_template=comparison_template
        )
        
        messages = [{
            "role": "user", 
            "content": [{"type": "text", "text": comparison_prompt}]
        }]
        
        comparison_result = self._make_api_call(messages)
        parsed_result = self.parse_evaluation_response(comparison_result)
        
        # Add quality check info to result
        if isinstance(parsed_result, dict) and "error" not in parsed_result:
            parsed_result["qualitaetspruefung"] = quality_check
            parsed_result["evaluation_status"] = "evaluated"
            
        return parsed_result
    
    def parse_evaluation_response(self, response):
        """Parse LLM evaluation response to JSON"""
        if "error" in response:
            return response
        
        text_output = response.get('response', '')
        
        # Clean up common LLM formatting issues
        text_output = text_output.strip()
        
        # Remove markdown code blocks if present
        if text_output.startswith('```json'):
            text_output = text_output[7:]  # Remove ```json
        if text_output.startswith('```'):
            text_output = text_output[3:]   # Remove ```
        if text_output.endswith('```'):
            text_output = text_output[:-3]  # Remove trailing ```
        
        # Find the JSON object
        start_idx = text_output.find('{')
        
        if start_idx != -1:
            # Find the matching closing brace
            brace_count = 0
            end_idx = start_idx
            for i, char in enumerate(text_output[start_idx:], start_idx):
                if char == '{':
                    brace_count += 1
                elif char == '}':
                    brace_count -= 1
                    if brace_count == 0:
                        end_idx = i + 1
                        break
            
            if brace_count == 0:  # Found matching closing brace
                json_str = text_output[start_idx:end_idx]
                try:
                    parsed = json.loads(json_str)
                    return parsed
                except json.JSONDecodeError as e:
                    return {
                        "error": f"JSON parse error: {str(e)}",
                        "raw_response": json_str[:500],
                        "full_response": text_output[:1000]
                    }
        
        return {
            "error": "Valid JSON not found in response",
            "raw_text": text_output[:500]
        }
    
    def evaluate_student_submission(self, student_image_path, student_filename):
        """Complete evaluation of a single student submission"""
        print(f"🔍 Evaluating: {student_filename}")
        
        # Define a confidence threshold
        CONFIDENCE_THRESHOLD = 0.60
        
        # Step 1: CNN Classification
        classification = self.classify_student_submission(student_image_path)
        if classification['status'] != 'success':
            return {
                "filename": student_filename,
                "status": "error",
                "error": "CNN classification failed",
                "classification": classification
            }
        
        category = classification['category']
        confidence = classification['confidence']
        print(f"   📊 Classified as: {category} ({confidence:.2%})")
        
        # **NEW: Quality Gate based on Confidence Score**
        if confidence < CONFIDENCE_THRESHOLD:
            print(f"   ⚠️ Low confidence score ({confidence:.2%}). Evaluation stopped.")
            return {
                "filename": student_filename,
                "status": "error",
                "error": f"Low confidence score. Model is not sure about the category. Score: {confidence:.2%}",
                "classification": classification,
                "evaluation": {"error": "Evaluation aborted due to low classification confidence."}
            }

        # Step 2: Get reference solution
        reference = self.get_best_reference(category)
        if not reference:
            return {
                "filename": student_filename,
                "status": "error", 
                "error": f"No reference solution for category {category}",
                "classification": classification
            }
        
        print(f"   🎯 Reference: {reference['filename']}")
        
        # Step 3: LLM comparison (3-step process)
        evaluation = self.compare_with_reference(student_image_path, reference, category)
        
         # **NEW: LLM-based Quality Gate**
        if evaluation.get('qualitaetspruefung', {}).get('is_evaluable') is False:
            reason = evaluation.get('qualitaetspruefung', {}).get('reason', 'No reason provided.')
            print(f"   ❌ Evaluation stopped by LLM Quality Gate: {reason}")
            return {
                "filename": student_filename,
                "status": "error",
                "error": f"LLM Quality Gate: Image not evaluable. Reason: {reason}",
                "classification": classification,
                "reference_used": reference,
                "evaluation": evaluation
            }

        if "error" in evaluation:
            print(f"   ❌ LLM evaluation failed: {evaluation['error']}")
        else:
            # Check both new and old format keys
            if 'gesamtbewertung' in evaluation:  # New format (single word)
                bewertung = evaluation['gesamtbewertung']
                score = bewertung.get('gesamtpunktzahl', 0)
                grade = bewertung.get('note', 5.0)
                print(f"   ✅ Score: {score}/100, Grade: {grade}")
            elif 'gesamt_bewertung' in evaluation:  # Old format (with underscore)
                bewertung = evaluation['gesamt_bewertung']
                score = bewertung.get('gesamtpunkte', 0)
                grade = bewertung.get('note', 5.0)
                print(f"   ✅ Score: {score}/100, Grade: {grade}")
            else:
                print(f"   ⚠️ Incomplete evaluation response")
                print(f"   Available keys: {list(evaluation.keys())}")
        
        # Compile final result
        result = {
            "filename": student_filename,
            "image_path": student_image_path,
            "classification": classification,
            "reference_used": reference,
            "evaluation": evaluation,
            "timestamp": datetime.now().isoformat(),
            "status": "success" if "error" not in evaluation else "partial"
        }
        
        return result

# Initialize evaluation system
# The OpenRouter key is read from the OPENROUTER_API_KEY environment variable
# so that no credential is stored in the notebook.
# NOTE(review): the key that used to be hard-coded here has been exposed in the
# notebook source and should be revoked.
API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

if not API_KEY:
    print("❌ Evaluation system setup failed: OPENROUTER_API_KEY environment variable is not set")
    evaluator = None
else:
    try:
        evaluator = Aufgabe4EvaluationSystem(API_KEY)
        print("\n🎓 Aufgabe 4 Evaluation System ready!")
        print(f"📊 Categories with references: {len(evaluator.reference_solutions)}")
    except Exception as e:
        print(f"❌ Evaluation system setup failed: {e}")
        evaluator = None


✅ API ready for evaluation (Model: qwen/qwen2.5-vl-32b-instruct)
🔧 Using Qwen2.5-VL-32B for improved JSON processing
✅ Found reference in directory: reference_database_latest.json
✅ Loaded reference database: 6 categories
   📁 Data Source: 5 reference(s)
   📁 Data-Flow: 5 reference(s)
   📁 Data-Transfer-Process: 5 reference(s)
   📁 Excel-Tabelle: 5 reference(s)
   📁 Info-Object: 5 reference(s)
   📁 Transformation: 5 reference(s)

🎓 Aufgabe 4 Evaluation System ready!
📊 Categories with references: 6


In [33]:
## 🧪 Quick Test - Single Image Evaluation

# Test the updated system with category-specific prompts
def test_single_image_evaluation():
    """Test evaluation with a single image"""
    if evaluator is None:
        print("❌ Evaluator not initialized")
        return
    
    # Test with a sample image from val/SAP
    test_images = [f for f in os.listdir(VAL_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    if not test_images:
        print("❌ No test images found in val/SAP")
        return
    
    # Pick first available image
    test_image = "8d281e0d909c4b9c842d967421edae11.png"
    test_path = os.path.join(VAL_DIR, test_image)
    
    print(f"🧪 Testing with: {test_image}")
    print(f"📁 Path: {test_path}")
    print("=" * 60)
    
    # Run evaluation
    result = evaluator.evaluate_student_submission(test_path, test_image)
    
    # Display results
    print("\n🎯 EVALUATION RESULTS:")
    print("=" * 60)
    
    if result['status'] == 'success':
        classification = result['classification']
        print(f"📊 CNN Classification: {classification['category']} ({classification['confidence']:.2%})")
        
        reference = result['reference_used']
        print(f"🎯 Reference: {reference['filename']}")
        
        evaluation = result['evaluation']
        if 'error' not in evaluation:
            # Check for category-specific results first
            if 'bewertungs_zusammenfassung' in evaluation:  # New category-specific format
                bewertung = evaluation['bewertungs_zusammenfassung']
                score = bewertung.get('gesamtpunktzahl', 0)
                grade = bewertung.get('note', 5.0)
                passed = bewertung.get('bestanden', False)
                feedback = f"Category-specific evaluation for {classification['category']}"
                
                print(f"✅ Final Score: {score}/100")
                print(f"📝 Grade: {grade}")
                print(f"🎓 Passed: {'Yes' if passed else 'No'}")
                print(f"💬 Feedback: {feedback}")
                
                # Show category-specific analysis structure
                print(f"🏷️ Category-Specific Analysis: {classification['category']}")
                
                # Show main sections from category-specific analysis
                category_sections = [k for k in evaluation.keys() if k not in ['bewertungs_zusammenfassung']]
                if category_sections:
                    print(f"📊 Analysis Sections: {', '.join(category_sections[:4])}")
                
                # Show criticisms and recommendations
                kritik = bewertung.get('hauptkritikpunkte', [])
                empfehlungen = bewertung.get('verbesserungsempfehlungen', [])
                
                if kritik:
                    print(f"⚠️ Main Issues: {len(kritik)} items")
                    for i, issue in enumerate(kritik[:2], 1):
                        print(f"   {i}. {issue}")
                        
                if empfehlungen:
                    print(f"💡 Recommendations: {len(empfehlungen)} items")
                    for i, rec in enumerate(empfehlungen[:2], 1):
                        print(f"   {i}. {rec}")
                        
            elif 'gesamtbewertung' in evaluation:  # Generic format fallback
                bewertung = evaluation['gesamtbewertung']
                score = bewertung.get('gesamtpunktzahl', 0)
                grade = bewertung.get('note', 5.0)
                passed = bewertung.get('bestanden', False)
                feedback = bewertung.get('feedback', 'No feedback')
                
                print(f"✅ Final Score: {score}/100")
                print(f"📝 Grade: {grade}")
                print(f"🎓 Passed: {'Yes' if passed else 'No'}")
                print(f"💬 Feedback: {feedback}")
                
                # Check for suggestions (both formats)
                suggestions = evaluation.get('empfehlungen', evaluation.get('verbesserungsvorschlaege', []))
                if suggestions:
                    print(f"💡 Suggestions: {len(suggestions)} items")
                    for i, suggestion in enumerate(suggestions[:3], 1):
                        print(f"   {i}. {suggestion}")
                        
                # Show category-specific analysis
                if 'kategorie_vergleich' in evaluation:
                    kategorie_info = evaluation['kategorie_vergleich']
                    print(f"🏷️ Category: {kategorie_info.get('kategorie', 'Unknown')}")
                    print(f"📊 Category Match: {kategorie_info.get('student_erfuellt_kategorie', False)}")
                    
                # Show detailed scores
                if 'detailbewertung' in evaluation:
                    detail_scores = evaluation['detailbewertung']
                    print("📈 Detailed Scores:")
                    print(f"   Struktur: {detail_scores.get('struktur_score', 0)}/100")
                    print(f"   Technik: {detail_scores.get('technik_score', 0)}/100") 
                    print(f"   Vollständigkeit: {detail_scores.get('vollstaendigkeit_score', 0)}/100")
                    print(f"   Korrektheit: {detail_scores.get('korrektheit_score', 0)}/100")
                    
            elif 'gesamt_bewertung' in evaluation:  # Old format (with underscore)
                bewertung = evaluation['gesamt_bewertung']
                score = bewertung.get('gesamtpunkte', 0)
                grade = bewertung.get('note', 5.0)
                passed = bewertung.get('bestanden', False)
                feedback = bewertung.get('feedback', 'No feedback')
                
                print(f"✅ Final Score: {score}/100")
                print(f"📝 Grade: {grade}")
                print(f"🎓 Passed: {'Yes' if passed else 'No'}")
                print(f"💬 Feedback: {feedback}")
            else:
                print("⚠️ Unexpected response format")
                print("Raw evaluation keys:", list(evaluation.keys()))
                # Try to extract any available scores
                if 'detailbewertung' in evaluation:
                    detail_scores = evaluation['detailbewertung']
                    avg_score = sum(detail_scores.values()) / len(detail_scores) if detail_scores else 0
                    print(f"📊 Average Score: {avg_score:.1f}/100")
        else:
            print(f"❌ Evaluation error: {evaluation['error']}")
    else:
        print(f"❌ Evaluation failed: {result.get('error', 'Unknown error')}")
    
    return result

# Ready to test!
# Print the two banner lines in one call, then run the quick test. The call is
# kept as the cell's final bare expression so the notebook displays the
# returned result dict as the cell output.
print(
    "🧪 Ready to test category-specific prompts!",
    "💡 Call test_single_image_evaluation() to run a test",
    sep="\n",
)
test_single_image_evaluation()

🧪 Ready to test category-specific prompts!
💡 Call test_single_image_evaluation() to run a test
🧪 Testing with: 8d281e0d909c4b9c842d967421edae11.png
📁 Path: C:\Users\egese\Desktop\dataset\val\SAP\8d281e0d909c4b9c842d967421edae11.png
🔍 Evaluating: 8d281e0d909c4b9c842d967421edae11.png
   📊 Classified as: Excel-Tabelle (70.50%)
   🎯 Reference: b3444b458ad24811833429eb7c7dacb8.png
   Prüfe Bildqualität: 8d281e0d909c4b9c842d967421edae11.png
   ✅ Bildqualität OK: Das Bild zeigt eine klare und lesbare Tabelle mit finanziellen Daten, die für eine Bewertung geeignet sind. Es gibt keine UI-Elemente, die den Inhalt stören, und die Qualität ist ausreichend hoch.
   📋 Using category-specific prompt for: Excel-Tabelle
   Analysiere Bild: 8d281e0d909c4b9c842d967421edae11.png
   Analysiere Bild: b3444b458ad24811833429eb7c7dacb8.png
   Vergleiche die beiden Analysen...
   📋 Using category-specific prompt for: Excel-Tabelle
   ⚠️ Incomplete evaluation response
   Available keys: ['struktur_qualitaet', 'v

{'filename': '8d281e0d909c4b9c842d967421edae11.png',
 'image_path': 'C:\\Users\\egese\\Desktop\\dataset\\val\\SAP\\8d281e0d909c4b9c842d967421edae11.png',
 'classification': {'category': 'Excel-Tabelle',
  'confidence': 0.7050408124923706,
  'status': 'success'},
 'reference_used': {'filename': 'b3444b458ad24811833429eb7c7dacb8.png',
  'path': 'C:\\Users\\egese\\Desktop\\dataset\\mapped_train\\Excel-Tabelle\\b3444b458ad24811833429eb7c7dacb8.png',
  'details': {'image_path': 'C:\\Users\\egese\\Desktop\\dataset\\mapped_train\\Excel-Tabelle\\b3444b458ad24811833429eb7c7dacb8.png',
   'filename': 'b3444b458ad24811833429eb7c7dacb8.png',
   'category': 'Excel-Tabelle',
   'predicted_class': 'Excel-Tabelle',
   'confidence': 0.9967940449714661,
   'category_match': True,
   'dimensions': '1920x1080',
   'file_size_mb': 0.2,
   'format': 'PNG',
   'quality_score': 5,
   'confidence_score': 9,
   'total_score': 7.5,
   'evaluation_date': '2025-06-22T20:32:25.434880'},
  'category_path': 'Excel-Ta