In [9]:
import os
import json
from datetime import datetime
def consolidate_all_batches(batch_dir="final_data/batches",
                            output_file="final_data/neo4j_data_politicians_enriched.json"):
    """Merge all per-batch JSON files into one consolidated JSON file.

    Every ``batch_*.json`` file in ``batch_dir`` (except
    ``batch_structure.json``) is loaded in sorted filename order. Its numeric
    counters are summed into the consolidated metadata and its ``results``
    entries are appended to one common list. A batch that cannot be read or
    is missing a required field is reported on stdout and skipped as a whole,
    so it contributes neither to the totals nor to the results.

    Args:
        batch_dir: Directory containing the ``batch_*.json`` files.
        output_file: Path of the consolidated JSON file to write (UTF-8).

    Returns:
        dict with two keys: ``"metadata"`` (the aggregated totals plus
        ``total_batches`` and ``consolidation_timestamp``) and ``"results"``
        (the concatenated result entries of all batches that loaded cleanly).

    Raises:
        OSError: if ``batch_dir`` cannot be listed or ``output_file`` cannot
            be written. Errors inside a single batch file are not raised.
    """
    # batch_structure.json also matches the batch_*.json pattern but is not a
    # batch result file, so it is filtered out explicitly.
    batch_files = sorted(
        name for name in os.listdir(batch_dir)
        if name.startswith('batch_') and name.endswith('.json')
        and name != 'batch_structure.json'
    )

    print(f"📦 Konsolidiere {len(batch_files)} Batch-Dateien...")

    # Aggregated metadata. total_batches counts the files found, including
    # any that later fail to load.
    metadata = {
        "total_batches": len(batch_files),
        "consolidation_timestamp": datetime.now().isoformat(),
        "total_successful_processing": 0,
        "total_failed_processing": 0,
        "total_embedding_time": 0,
        "total_llm_time": 0,
        "total_embedding_costs": 0,
        "total_llm_costs": 0,
        "total_batch_duration": 0
    }
    consolidated_data = {"metadata": metadata, "results": []}

    # Consolidated metadata field -> per-batch key that is summed into it.
    metric_sources = {
        "total_successful_processing": "successful_processing",
        "total_failed_processing": "failed_processing",
        "total_embedding_time": "total_embedding_time",
        "total_llm_time": "total_llm_time",
        "total_embedding_costs": "total_embedding_costs",
        "total_llm_costs": "total_llm_costs",
        "total_batch_duration": "batch_duration",
    }

    for batch_file in batch_files:
        try:
            print(f"📁 Verarbeite {batch_file}...")

            # Read as UTF-8 explicitly: the output is written as UTF-8, and
            # the platform default encoding is not UTF-8 everywhere.
            with open(os.path.join(batch_dir, batch_file), 'r', encoding='utf-8') as f:
                batch_data = json.load(f)

            print(f"   Typ: {type(batch_data)}")
            print(f"   Keys: {list(batch_data.keys())}")

            # Compute the new totals and copy the results BEFORE changing any
            # shared state. If a field is missing or has an unusable type the
            # exception fires here and the batch is skipped completely,
            # instead of leaving half-updated totals behind.
            new_totals = {
                total_key: metadata[total_key] + batch_data[batch_key]
                for total_key, batch_key in metric_sources.items()
            }
            batch_results = list(batch_data["results"])

            # Commit: from here on nothing can fail for this batch.
            metadata.update(new_totals)
            consolidated_data["results"].extend(batch_results)

            # batch_num is only used for the log line; fall back to the file
            # name so a missing label does not report an already merged
            # batch as failed.
            batch_label = batch_data.get('batch_num', batch_file)
            print(f"✅ Batch {batch_label}: {batch_data['successful_processing']} Ergebnisse hinzugefügt")

        except Exception as e:
            # Best effort: report the broken batch and carry on with the rest.
            print(f"❌ Fehler bei {batch_file}: {e}")
            print(f"   Typ: {type(e).__name__}")
            continue

    # Total cost = embedding cost + LLM cost
    total_costs = metadata["total_embedding_costs"] + metadata["total_llm_costs"]

    print("\n KONSOLIDIERUNG ABGESCHLOSSEN:")
    print(f"   Gesamt Ergebnisse: {len(consolidated_data['results'])}")
    print(f"   Erfolgreich: {metadata['total_successful_processing']}")
    print(f"   Fehler: {metadata['total_failed_processing']}")
    print(f"   Gesamtkosten: {total_costs:.4f}€")
    print(f"   Gesamtzeit: {metadata['total_batch_duration']:.1f} Sekunden")

    # Write the consolidated data to disk
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(consolidated_data, f, ensure_ascii=False, indent=2)

    print(f"💾 Konsolidierte Daten gespeichert: {output_file}")

    return consolidated_data

# Run the consolidation and keep the returned data in the notebook namespace
consolidated_data = consolidate_all_batches()

📦 Konsolidiere 45 Batch-Dateien...
📁 Verarbeite batch_000.json...
   Typ: <class 'dict'>
   Keys: ['batch_num', 'batch_size', 'successful_processing', 'failed_processing', 'batch_start_time', 'batch_duration', 'total_embedding_time', 'total_llm_time', 'total_llm_costs', 'total_embedding_costs', 'results', 'errors']
✅ Batch 0: 100 Ergebnisse hinzugefügt
📁 Verarbeite batch_001.json...
   Typ: <class 'dict'>
   Keys: ['batch_num', 'batch_size', 'successful_processing', 'failed_processing', 'batch_start_time', 'batch_duration', 'total_embedding_time', 'total_llm_time', 'total_llm_costs', 'total_embedding_costs', 'results', 'errors']
✅ Batch 1: 100 Ergebnisse hinzugefügt
📁 Verarbeite batch_002.json...
   Typ: <class 'dict'>
   Keys: ['batch_num', 'batch_size', 'successful_processing', 'failed_processing', 'batch_start_time', 'batch_duration', 'total_embedding_time', 'total_llm_time', 'total_llm_costs', 'total_embedding_costs', 'results', 'errors']
✅ Batch 2: 100 Ergebnisse hinzugefügt
📁 Ver

In [10]:
from neo4j import GraphDatabase
import json
from datetime import datetime

class Neo4jClassifier:
    """Writes classification results to Neo4j and checks the outcome.

    Holds one Neo4j driver. Call ``close()`` when done, or use the instance
    as a context manager (``with Neo4jClassifier(...) as c:``), which closes
    the driver on exit.
    """

    def __init__(self, uri, user, password):
        """Create a driver for ``uri`` authenticated with ``user``/``password``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def __enter__(self):
        """Return the instance itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never swallows exceptions."""
        self.close()
        return False

    def push_classifications_to_neo4j(self, classifications_file="final_data/neo4j_data_politicians_enriched.json"):
        """Create one Classification node per entry of the consolidated file.

        Reads the ``results`` list from ``classifications_file`` and, for each
        entry, creates a ``Classification`` node linked via
        ``HAS_CLASSIFICATION`` from the ``Politician`` node whose
        ``elementId`` equals the entry's ``neo4j_element_id``.

        NOTE: the query uses ``CREATE``, so calling this method again adds a
        further set of Classification nodes; it does not update existing ones.

        Args:
            classifications_file: Path of the consolidated JSON file (UTF-8).

        Returns:
            Number of Classification nodes created (0 if the file has no
            results; the database is not contacted in that case).

        Raises:
            KeyError: if an entry lacks ``neo4j_element_id``, ``dqr_predict``,
                ``comment_predict`` or ``confidence_score``.
            Exception: any error raised while running the query is printed
                and re-raised.
        """
        # Load classifications
        with open(classifications_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        classifications = data['results']
        print(f"📊 Lade {len(classifications)} Klassifikationen...")

        # Nothing to import: skip the round trip and avoid dividing by zero
        # in the per-item average further down.
        if not classifications:
            print("ℹ️ Keine Klassifikationen vorhanden – nichts zu importieren")
            return 0

        # Reduce every entry to the four fields the query uses
        neo4j_data = [
            {
                'neo4j_element_id': classification['neo4j_element_id'],
                'dqr_level': classification['dqr_predict'],
                'comment': classification['comment_predict'],
                'confidence': classification['confidence_score']
            }
            for classification in classifications
        ]

        print("🚀 Starte Neo4j-Import...")

        # One query for the whole list. Rows whose element id matches no
        # Politician are dropped by MATCH and therefore create nothing.
        query = """
        UNWIND $classifications AS classification
        MATCH (p:Politician)
        WHERE elementId(p) = classification.neo4j_element_id
        CREATE (c:Classification {
            dqr_level: classification.dqr_level,
            comment: classification.comment,
            confidence: classification.confidence
        })
        CREATE (p)-[:HAS_CLASSIFICATION]->(c)
        RETURN count(c) as created_nodes
        """

        try:
            with self.driver.session() as session:
                start_time = datetime.now()

                result = session.run(query, classifications=neo4j_data)
                created_count = result.single()['created_nodes']

                duration = (datetime.now() - start_time).total_seconds()

                print("✅ Import erfolgreich!")
                print(f"   Erstellte Nodes: {created_count}")
                print(f"   Dauer: {duration:.1f} Sekunden")
                print(f"   Durchschnitt: {duration/len(classifications)*1000:.1f} ms pro Klassifikation")

                # Make silently dropped rows visible instead of reporting a
                # plain success.
                skipped = len(neo4j_data) - created_count
                if skipped > 0:
                    print(f"⚠️ {skipped} Klassifikationen ohne passenden Politician-Knoten übersprungen")

                return created_count

        except Exception as e:
            print(f"❌ Fehler beim Import: {e}")
            raise

    def verify_import(self):
        """Count stored classifications and the politicians that have one.

        Returns:
            The single result record with the fields
            ``total_classifications`` and ``politicians_with_classification``.

        Raises:
            Exception: any error raised while running the query is printed
                and re-raised.
        """
        query = """
        MATCH (p:Politician)-[:HAS_CLASSIFICATION]->(c:Classification)
        RETURN count(c) as total_classifications,
               count(DISTINCT p) as politicians_with_classification
        """

        try:
            with self.driver.session() as session:
                result = session.run(query)
                stats = result.single()

                print("🔍 VERIFIKATION:")
                print(f"   Gesamt Klassifikationen: {stats['total_classifications']}")
                print(f"   Politiker mit Klassifikation: {stats['politicians_with_classification']}")

                return stats

        except Exception as e:
            print(f"❌ Fehler bei Verifikation: {e}")
            raise


# Neo4j connection.
# Connection settings are read from the environment (NEO4J_URI, NEO4J_USER,
# NEO4J_PASSWORD) so credentials do not have to live in the notebook. The
# fallbacks are the values this cell used before, so it behaves as before
# when the variables are unset; set NEO4J_PASSWORD to override the fallback.
import os  # local import keeps this cell runnable on its own

classifier = Neo4jClassifier(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "bundestag_password")
)

try:
    # Insert the classifications
    created_count = classifier.push_classifications_to_neo4j()

    # Verify what is now stored in the graph
    classifier.verify_import()

finally:
    # Always release the driver, even if the import or verification failed
    classifier.close()
    print("🔒 Neo4j-Verbindung geschlossen")

📊 Lade 4454 Klassifikationen...
🚀 Starte Neo4j-Import...
✅ Import erfolgreich!
   Erstellte Nodes: 4454
   Dauer: 1.2 Sekunden
   Durchschnitt: 0.3 ms pro Klassifikation
🔍 VERIFIKATION:
   Gesamt Klassifikationen: 4454
   Politiker mit Klassifikation: 4454
🔒 Neo4j-Verbindung geschlossen
