In [None]:
import os
import json
import base64
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from datetime import datetime
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

# Local imports
from predict import predict_image
from prompt_templates import prompt_templates, comparison_prompt

# Startup banner (two lines, emitted with a single call).
print(
    "✅ Aufgabe 4 Final Evaluation System",
    "🎯 Ready to test complete workflow!",
    sep="\n",
)

# Locations used by the rest of the notebook.
VAL_DIR = r"C:\Users\egese\Desktop\dataset\val\SAP"  # validation images (machine-specific absolute path)
RESULTS_DIR = "results/validation_analysis/"  # analysis results
REFERENCE_DIR = "results/reference_solutions/"  # reference solution JSON files
FINAL_OUTPUT = "results/aufgabe4_final/"  # final output of this notebook

# Only the final output directory is created here; the other locations are
# merely reported.
os.makedirs(FINAL_OUTPUT, exist_ok=True)

# Summary of the configured locations, one per line.
print(
    f"📁 Validation images: {VAL_DIR}",
    f"📊 Analysis results: {RESULTS_DIR}",
    f"🎯 Reference solutions: {REFERENCE_DIR}",
    f"🏆 Final output: {FINAL_OUTPUT}",
    sep="\n",
)


In [None]:
class Aufgabe4EvaluationSystem:
    """End-to-end evaluation of student submissions for "Aufgabe 4".

    For each submitted image the system
    1. classifies it with ``predict_image``,
    2. looks up the first reference solution stored for the predicted
       category, and
    3. sends both images plus a grading prompt to a Claude model through the
       OpenRouter chat-completions endpoint and parses the JSON it returns.
    """

    # Media type used when it cannot be derived from the file name. This is
    # the value that used to be hard-coded for every image.
    DEFAULT_MIME_TYPE = "image/jpeg"

    def __init__(self, api_key):
        """Store the API key, configure the endpoint and load the references.

        Args:
            api_key: OpenRouter API key sent as a bearer token.
        """
        self.api_key = api_key
        self.setup_claude_api()

        # Maps category name -> list of reference entries (filled below).
        self.reference_solutions = {}
        self.load_reference_database()

        # Evaluation results
        self.evaluation_results = []

    def setup_claude_api(self):
        """Set the OpenRouter endpoint URL and the model identifier."""
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.model = "anthropic/claude-3-sonnet:beta"
        print("✅ Claude API ready for evaluation")

    def load_reference_database(self):
        """Load the newest ``reference_solutions_*`` file from REFERENCE_DIR.

        "Newest" means the file name that sorts last. The file's
        ``references`` entry becomes ``self.reference_solutions``. A missing
        directory or an empty directory only prints a warning; an unreadable
        or malformed file still raises, as before.
        """
        if not os.path.exists(REFERENCE_DIR):
            print("❌ Reference directory not found!")
            return

        ref_files = [
            name for name in os.listdir(REFERENCE_DIR)
            if name.startswith('reference_solutions_')
        ]
        if not ref_files:
            print("⚠️ No reference solutions found!")
            return

        # Lexicographically greatest name, same pick as sorted(...)[-1].
        ref_path = os.path.join(REFERENCE_DIR, max(ref_files))
        with open(ref_path, 'r', encoding='utf-8') as f:
            ref_data = json.load(f)
        self.reference_solutions = ref_data.get('references', {})

        print(f"✅ Loaded reference solutions: {len(self.reference_solutions)} categories")
        for category, refs in self.reference_solutions.items():
            print(f"   📁 {category}: {len(refs)} reference(s)")

    def classify_student_submission(self, image_path):
        """Classify a submission image with ``predict_image``.

        Returns:
            A dict with ``category``, ``confidence`` (float) and ``status``.
            On any failure ``status`` is ``"error"``, ``category`` is None,
            ``confidence`` is 0.0 and ``error`` holds the exception text.
        """
        # Deliberately broad: any failure inside the classifier is reported
        # to the caller as an error record instead of aborting the run.
        try:
            predicted_class, confidence = predict_image(image_path)
        except Exception as e:
            return {
                "category": None,
                "confidence": 0.0,
                "status": "error",
                "error": str(e)
            }
        return {
            "category": predicted_class,
            "confidence": float(confidence),
            "status": "success"
        }

    def get_best_reference(self, category):
        """Return the first stored reference of ``category``, or None.

        The reference file is resolved relative to VAL_DIR. None is returned
        when the category is unknown, has no references, the entry carries no
        ``filename``, or the file does not exist.
        """
        references = self.reference_solutions.get(category)
        if not references:
            return None

        # Get first (best) reference
        best_ref = references[0]
        ref_filename = best_ref.get('filename') if isinstance(best_ref, dict) else None
        if not ref_filename:
            # Malformed entry: treat it like a missing reference rather than
            # raising KeyError in the middle of an evaluation.
            return None

        ref_path = os.path.join(VAL_DIR, ref_filename)
        if not os.path.exists(ref_path):
            return None

        return {
            "filename": ref_filename,
            "path": ref_path,
            "details": best_ref
        }

    def encode_image(self, image_path):
        """Return the file's bytes as a base64 string, or None on failure."""
        try:
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode('utf-8')
        except (OSError, TypeError, ValueError) as e:
            print(f"❌ Image encoding error: {e}")
            return None

    @classmethod
    def _guess_mime_type(cls, image_path):
        """Guess the image media type from the file name.

        Falls back to ``DEFAULT_MIME_TYPE`` when the extension is unknown or
        does not map to an ``image/*`` type.
        """
        import mimetypes  # local import: not part of the file's import block

        guessed, _ = mimetypes.guess_type(str(image_path))
        if guessed and guessed.startswith("image/"):
            return guessed
        return cls.DEFAULT_MIME_TYPE

    @classmethod
    def _to_data_url(cls, image_path, image_b64):
        """Build a ``data:`` URL carrying the image's media type and payload."""
        return f"data:{cls._guess_mime_type(image_path)};base64,{image_b64}"

    @staticmethod
    def _build_comparison_prompt(category):
        """Return the (German) grading prompt for ``category``."""
        return f"""
Vergleiche das Student-Bild mit der Musterlösung in der Kategorie {category}.

BEWERTUNGSKRITERIEN:
1. Kategorie-Zuordnung (0-25 Punkte)
2. Technische Qualität (0-25 Punkte) 
3. Vollständigkeit (0-25 Punkte)
4. SAP BW Konformität (0-25 Punkte)

Gib eine detaillierte Bewertung als JSON zurück:
{{
  "kategorie_bewertung": {{
    "richtige_kategorie": true/false,
    "sicherheit": 0-100,
    "punkte": 0-25,
    "erklaerung": "Begründung"
  }},
  "technische_qualitaet": {{
    "struktur": 0-10,
    "lesbarkeit": 0-10,
    "vollstaendigkeit": 0-5,
    "punkte": 0-25,
    "kommentar": "Detailbewertung"
  }},
  "inhaltliche_bewertung": {{
    "fachliche_korrektheit": 0-15,
    "business_kontext": 0-10,
    "punkte": 0-25,
    "feedback": "Inhaltliches Feedback"
  }},
  "sap_bw_konformitaet": {{
    "standard_konform": 0-15,
    "terminologie": 0-10,
    "punkte": 0-25,
    "anmerkungen": "SAP-spezifische Bewertung"
  }},
  "gesamt_bewertung": {{
    "gesamtpunkte": 0-100,
    "note": 1.0-5.0,
    "bestanden": true/false,
    "feedback": "Zusammenfassendes Feedback für Student"
  }},
  "verbesserungsvorschlaege": [
    "Konkrete Verbesserungshinweise"
  ]
}}
"""

    def compare_with_reference(self, student_path, reference_info, category):
        """Ask the model to grade the student image against the reference.

        Args:
            student_path: Path of the student's image.
            reference_info: Dict with at least a ``path`` entry, as returned
                by ``get_best_reference``.
            category: Category name inserted into the prompt.

        Returns:
            ``{"response": <model text>}`` on success, otherwise
            ``{"error": <description>}``. This method does not raise for
            encoding, transport or HTTP-status failures.
        """
        student_b64 = self.encode_image(student_path)
        reference_b64 = self.encode_image(reference_info['path'])

        if not student_b64 or not reference_b64:
            return {"error": "Failed to encode images"}

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "http://localhost:8888",
            "X-Title": "Aufgabe 4 Evaluation"
        }

        data = {
            "model": self.model,
            "max_tokens": 2500,
            "temperature": 0.1,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "**STUDENT SUBMISSION:**"},
                        {
                            "type": "image_url",
                            # The media type follows the file name instead of
                            # always claiming JPEG.
                            "image_url": {"url": self._to_data_url(student_path, student_b64)}
                        },
                        {"type": "text", "text": "**MUSTERLÖSUNG (REFERENCE):**"},
                        {
                            "type": "image_url",
                            "image_url": {"url": self._to_data_url(reference_info['path'], reference_b64)}
                        },
                        {"type": "text", "text": self._build_comparison_prompt(category)}
                    ]
                }
            ]
        }

        try:
            response = requests.post(self.base_url, headers=headers, json=data, timeout=120)
            if response.status_code != 200:
                return {"error": f"HTTP {response.status_code}: {response.text}"}
            result = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            return {"error": f"Request failed: {str(e)}"}

        choices = result.get('choices') if isinstance(result, dict) else None
        if not choices or not isinstance(choices, list):
            # A 200 response without choices would otherwise surface as an
            # IndexError or as an empty answer with no explanation.
            return {"error": f"API response contained no choices: {str(result)[:500]}"}

        first_choice = choices[0]
        message = first_choice.get('message') if isinstance(first_choice, dict) else None
        content = message.get('content') if isinstance(message, dict) else None
        # ``content`` may be null in the payload; normalise to an empty string.
        return {"response": content or ''}

    def parse_evaluation_response(self, response):
        """Extract the JSON evaluation from a ``compare_with_reference`` result.

        Everything between the first ``{`` and the last ``}`` of the model
        text is parsed as JSON, so surrounding prose or code fences are
        ignored.

        Returns:
            The parsed dict; the input unchanged if it already carries an
            ``error``; or a new error dict (with a 500-character excerpt of
            the text) when no JSON is present or it cannot be parsed.
        """
        if "error" in response:
            return response

        text_output = response.get('response', '')
        if not isinstance(text_output, str):
            # e.g. a null or structured ``content``: nothing to search for
            # JSON in, and calling str methods on it would raise.
            return {
                "error": "JSON not found in response",
                "raw_text": str(text_output)[:500]
            }

        # Extract JSON
        start_idx = text_output.find('{')
        end_idx = text_output.rfind('}') + 1
        if start_idx == -1 or end_idx <= start_idx:
            return {
                "error": "JSON not found in response",
                "raw_text": text_output[:500]
            }

        try:
            return json.loads(text_output[start_idx:end_idx])
        except json.JSONDecodeError as e:
            return {
                "error": f"JSON parse error: {str(e)}",
                "raw_response": text_output[:500]
            }

    def evaluate_student_submission(self, student_image_path, student_filename):
        """Run classification, reference lookup and model grading for one image.

        Args:
            student_image_path: Path of the image to evaluate.
            student_filename: Name used in log output and in the result.

        Returns:
            A result dict. ``status`` is ``"error"`` when classification fails
            or no reference exists, ``"partial"`` when the model evaluation
            produced an error, and ``"success"`` otherwise.
        """
        print(f"🔍 Evaluating: {student_filename}")

        # Step 1: CNN Classification
        classification = self.classify_student_submission(student_image_path)
        if classification['status'] != 'success':
            return {
                "filename": student_filename,
                "status": "error",
                "error": "CNN classification failed",
                "classification": classification
            }

        category = classification['category']
        print(f"   📊 Classified as: {category} ({classification['confidence']:.2%})")

        # Step 2: Get reference solution
        reference = self.get_best_reference(category)
        if not reference:
            return {
                "filename": student_filename,
                "status": "error",
                "error": f"No reference solution for category {category}",
                "classification": classification
            }

        print(f"   🎯 Reference: {reference['filename']}")

        # Step 3: Claude comparison
        comparison = self.compare_with_reference(student_image_path, reference, category)
        evaluation = self.parse_evaluation_response(comparison)

        overall = evaluation.get('gesamt_bewertung')
        if "error" in evaluation:
            print(f"   ❌ Claude evaluation failed: {evaluation['error']}")
        elif isinstance(overall, dict):
            score = overall.get('gesamtpunkte', 0)
            grade = overall.get('note', 5.0)
            print(f"   ✅ Score: {score}/100, Grade: {grade}")
        else:
            # Missing, or not the object the prompt asked for.
            print("   ⚠️ Incomplete evaluation response")

        # Compile final result
        return {
            "filename": student_filename,
            "image_path": student_image_path,
            "classification": classification,
            "reference_used": reference,
            "evaluation": evaluation,
            "timestamp": datetime.now().isoformat(),
            "status": "success" if "error" not in evaluation else "partial"
        }

# Initialize evaluation system.
# SECURITY: the OpenRouter key used to be hard-coded on this line. A key that
# has appeared in source must be treated as leaked and revoked. It is now read
# from the OPENROUTER_API_KEY environment variable instead.
API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

evaluator = None
if not API_KEY:
    print("❌ Evaluation system setup failed: OPENROUTER_API_KEY environment variable is not set")
else:
    # Broad on purpose: any setup failure leaves ``evaluator`` as None so that
    # later cells can check for it instead of the notebook aborting here.
    try:
        evaluator = Aufgabe4EvaluationSystem(API_KEY)
        print("\n🎓 Aufgabe 4 Evaluation System ready!")
        print(f"📊 Categories with references: {len(evaluator.reference_solutions)}")
    except Exception as e:
        print(f"❌ Evaluation system setup failed: {e}")
        evaluator = None
