In [4]:
import json
import os

import pandas as pd
from neo4j import GraphDatabase

# Connection to the Neo4j database.
# Connection settings are read from the environment so that credentials do not
# have to live in the notebook. The fallbacks are the local development values
# this notebook was originally written against; override NEO4J_PASSWORD (and
# friends) for any shared or non-local database.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
driver = GraphDatabase.driver(
    uri,
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "bundestag_password"),
    ),
)

In [5]:
# Smoke test: fetch a handful of Politician nodes and print them as-is.
with driver.session() as session:
    for row in session.run("MATCH (p:Politician) RETURN p LIMIT 5"):
        politician_node = row["p"]
        print(politician_node)

<Node element_id='4:efc8ecef-1ae9-4ae6-a5c6-3ab222bbb2a5:44' labels=frozenset({'Politician'}) properties={'detail_page': 'https://de.wikipedia.org/wiki/Gerhart_Baum', 'firstname': 'Gerhart', 'full_name': 'Gerhart Baum', 'death_year': '2025', 'birth_year': '1932', 'lastname': 'Baum'}>
<Node element_id='4:efc8ecef-1ae9-4ae6-a5c6-3ab222bbb2a5:49' labels=frozenset({'Politician'}) properties={'detail_page': 'https://de.wikipedia.org/wiki/Hans_Bardens', 'firstname': 'Hans', 'full_name': 'Hans Bardens', 'death_year': '2003', 'lastname': 'Bardens', 'birth_year': '1927'}>
<Node element_id='4:efc8ecef-1ae9-4ae6-a5c6-3ab222bbb2a5:55' labels=frozenset({'Politician'}) properties={'detail_page': 'https://de.wikipedia.org/wiki/Willy_Bartsch', 'firstname': 'Willy', 'full_name': 'Willy Bartsch', 'death_year': '1988', 'lastname': 'Bartsch', 'birth_year': '1905'}>
<Node element_id='4:efc8ecef-1ae9-4ae6-a5c6-3ab222bbb2a5:59' labels=frozenset({'Politician'}) properties={'detail_page': 'https://de.wikipedia

In [9]:
# Smoke test: politicians together with the content nodes attached to them.
with driver.session() as session:
    result = session.run("""
        MATCH (p:Politician)-[:HAS_CONTENT]->(c)
        RETURN p.full_name, c.section_header, c.section_content
        LIMIT 10
    """)
    for record in result:
        # A content node may lack section_content (null); slicing None would
        # raise TypeError, so fall back to an empty preview.
        content_preview = (record['c.section_content'] or "")[:100]
        print(f"Politiker: {record['p.full_name']}")
        print(f"Header: {record['c.section_header']}")
        print(f"Content: {content_preview}...")
        print("-" * 50)

Politiker: Gerhart Baum
Header: Einzelnachweise
Content: ↑ Susanne Führer: Gerhart Baum und Burkhard Hirsch. In: deutschlandfunkkultur.de. 21. März 2016, abg...
--------------------------------------------------
Politiker: Gerhart Baum
Header: Weblinks
Content: Commons: Gerhart Baum – Sammlung von Bildern
Wikinews: Gerhart Baum – in den Nachrichten
Wikiquote: ...
--------------------------------------------------
Politiker: Gerhart Baum
Header: Literatur
Content: Sabine Leutheusser-Schnarrenberger (Hrsg.): In liberaler Mission. Gerhart Baum und die deutsche Demo...
--------------------------------------------------
Politiker: Gerhart Baum
Header: Veröffentlichungen (Auswahl)
Content: mit Jochen Bölsche (Hrsg.): Der Weg in den Überwachungsstaat. Mit neuen Dokumenten und Stellungnahme...
--------------------------------------------------
Politiker: Gerhart Baum
Header: Filme
Content: alpha Forum. Gerhart Rudolf Baum. Gespräch, Deutschland, 2009, 43 Min., Moderation: Susanne Zimmer, ...
-

In [8]:
# Load the minister data. The file is JSON Lines: one JSON object per line.
# The encoding is given explicitly because the names contain umlauts and the
# platform default encoding is not UTF-8 everywhere.
with open("minister.json", "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing newline block) instead of failing in json.loads.
    ministers = [json.loads(line) for line in f if line.strip()]

print(f"Anzahl Minister: {len(ministers)}")
if ministers:
    # Only peek at the first entry when there is one; an empty file should not crash the cell.
    print(f"Erste Minister: {ministers[0]['Vorname']} {ministers[0]['Nachname']}")

Anzahl Minister: 195
Erste Minister: Joschka Fischer


In [23]:
def enrich_ministers_with_content(ministers, driver):
    """Enrich minister records with the content stored for them in Neo4j.

    For every minister the matching politician (and its content nodes) is
    looked up via ``find_politician_with_content``. Matched ministers are
    returned as enriched copies; everything else ends up in the failed list.

    Args:
        ministers: List of dicts with at least the keys ``"Vorname"`` and
            ``"Nachname"``.
        driver: Neo4j driver, passed through to ``find_politician_with_content``.

    Returns:
        A tuple ``(enriched_ministers, failed_ministers)``.
        ``enriched_ministers`` contains shallow copies of the matched input
        dicts, extended by ``neo4j_content``, ``neo4j_matched``,
        ``neo4j_politician_id`` and ``content_count``.
        ``failed_ministers`` contains the original, unmodified dicts of all
        ministers that could not be matched or raised an error.
    """
    enriched_ministers = []
    failed_ministers = []
    total = len(ministers)

    for position, minister in enumerate(ministers, start=1):
        # Read the names defensively: a record with a missing key must end up
        # in failed_ministers instead of raising again inside the error handler.
        first_name = minister.get("Vorname")
        last_name = minister.get("Nachname")

        try:
            print(f"Processing {position}/{total}: {first_name} {last_name}")

            if not first_name or not last_name:
                raise ValueError("missing 'Vorname' or 'Nachname'")

            # Politician AND content are fetched in ONE query.
            politician_with_content = find_politician_with_content(
                first_name,
                last_name,
                driver
            )

            if politician_with_content:
                content_nodes = politician_with_content["content"]
                politician = politician_with_content["politician"]

                # Prefer an explicit "id" property; nodes without one are
                # identified by the driver's element id instead of None.
                politician_id = politician.get("id")
                if politician_id is None:
                    politician_id = getattr(politician, "element_id", None)

                # Work on a copy so the caller's input is not mutated and a
                # failure half-way through cannot leave a partially enriched
                # record in failed_ministers.
                enriched = dict(minister)
                enriched["neo4j_content"] = content_nodes
                enriched["neo4j_matched"] = True
                enriched["neo4j_politician_id"] = politician_id
                enriched["content_count"] = len(content_nodes)

                # Only successful matches go into the enriched list.
                enriched_ministers.append(enriched)
                print(f"✅ Matched: {len(content_nodes)} content nodes")

            else:
                # No match: the record goes ONLY into failed_ministers.
                failed_ministers.append(minister)
                print(f"❌ No match found")

        except Exception as e:
            # Best effort: one broken record must not abort the whole run.
            print(f"❌ Error processing {first_name} {last_name}: {str(e)}")
            failed_ministers.append(minister)

    return enriched_ministers, failed_ministers

def find_politician_with_content(first_name, last_name, driver):
    """Find a politician AND its content nodes in a single query.

    Several spelling variants of the name are tried in order (as given,
    stripped, whitespace-collapsed, lower-case, title-case); identical
    variants are queried only once. Within one query, exact matches on
    ``firstname``/``lastname`` or ``full_name`` are preferred over mere
    substring matches on ``full_name``.

    Args:
        first_name: First name of the person to look up.
        last_name: Last name of the person to look up.
        driver: Object providing ``session()`` as a context manager whose
            ``run(query, **params)`` result offers ``single()``.

    Returns:
        ``{"politician": <node>, "content": [<section dicts>]}`` for the first
        variant that yields a record, or ``None`` if nothing matched or one of
        the names is empty.
    """
    if not first_name or not last_name:
        return None

    # Collapse runs of whitespace ("Karl  Theodor " -> "Karl Theodor").
    clean_first = " ".join(first_name.split())
    clean_last = " ".join(last_name.split())
    if not clean_first or not clean_last:
        # An empty name would make `CONTAINS ''` match every politician.
        return None

    candidates = [
        (first_name, last_name),
        (first_name.strip(), last_name.strip()),
        (clean_first, clean_last),
        (clean_first.lower(), clean_last.lower()),
        (clean_first.title(), clean_last.title()),
    ]
    # Drop duplicates while keeping the order, so that clean input does not
    # trigger the same round-trip several times.
    name_variants = list(dict.fromkeys(candidates))

    with driver.session() as session:
        for fname, lname in name_variants:
            # match_rank makes LIMIT 1 prefer an exact match (0) over a
            # substring match (1) instead of picking one arbitrarily.
            result = session.run("""
                MATCH (p:Politician)-[:HAS_CONTENT]->(c)
                WHERE (p.firstname = $first_name AND p.lastname = $last_name)
                   OR (p.full_name CONTAINS $first_name AND p.full_name CONTAINS $last_name)
                   OR (p.full_name = $full_name)
                WITH p, c,
                     CASE
                         WHEN (p.firstname = $first_name AND p.lastname = $last_name)
                              OR p.full_name = $full_name
                         THEN 0
                         ELSE 1
                     END AS match_rank
                RETURN p, match_rank, collect({
                    section_header: c.section_header,
                    section_content: c.section_content,
                    content_id: c.id
                }) as content
                ORDER BY match_rank
                LIMIT 1
            """, first_name=fname, last_name=lname, full_name=f"{fname} {lname}")

            record = result.single()
            if record:
                return {
                    "politician": record["p"],
                    "content": record["content"]
                }

    return None

In [24]:
# Load the ministers (JSON Lines: one JSON object per line).
with open("minister.json", "r", encoding="utf-8") as f:
    # Blank lines are skipped instead of failing in json.loads.
    ministers = [json.loads(line) for line in f if line.strip()]

# Enrich with Neo4j content.
enriched_ministers, failed_ministers = enrich_ministers_with_content(ministers, driver)

# The output is written with ensure_ascii=False, i.e. it contains raw umlauts.
# Write it explicitly as UTF-8 so that it does not depend on the platform
# default and matches how the file is read later (encoding='utf-8').
with open("minister_with_neo4j_content.json", "w", encoding="utf-8") as f:
    json.dump(enriched_ministers, f, indent=2, ensure_ascii=False)

# Store the failed matches separately.
with open("minister_failed_matching.json", "w", encoding="utf-8") as f:
    json.dump(failed_ministers, f, indent=2, ensure_ascii=False)

# Statistics
matched = sum(1 for m in enriched_ministers if m["neo4j_matched"])
failed = len(failed_ministers)
# Guard against an empty input file, which would otherwise divide by zero.
success_rate = matched / len(ministers) * 100 if ministers else 0.0
print(f"✅ Erfolgreich gematched: {matched}/{len(ministers)}")
print(f"❌ Fehlgeschlagen: {failed}")
print(f"📊 Erfolgsrate: {success_rate:.1f}%")

Processing 1/195: Joschka Fischer
✅ Matched: 9 content nodes
Processing 2/195: Hans Klein
✅ Matched: 11 content nodes
Processing 3/195: Alex Möller
✅ Matched: 7 content nodes
Processing 4/195: Walter  Arendt
❌ No match found
Processing 5/195: Egon  Bahr
❌ No match found
Processing 6/195: Theodor Blank
✅ Matched: 6 content nodes
Processing 7/195: Kurt Bodewig
✅ Matched: 10 content nodes
Processing 8/195: Willy Brandt
✅ Matched: 11 content nodes
Processing 9/195: Aenne Brauksiepe
✅ Matched: 10 content nodes
Processing 10/195: Egon Franke
✅ Matched: 13 content nodes
Processing 11/195: Antje Huber
✅ Matched: 5 content nodes
Processing 12/195: Hans Katzer
✅ Matched: 10 content nodes
Processing 13/195: Georg Leber
✅ Matched: 9 content nodes
Processing 14/195: Paul Lücke
✅ Matched: 11 content nodes
Processing 15/195: Franz Müntefering
✅ Matched: 11 content nodes
Processing 16/195: Karl Ravens
✅ Matched: 9 content nodes
Processing 17/195: Hannelore Rönsch
✅ Matched: 10 content nodes
Processing

In [25]:


# Load the enriched JSON. The encoding is given explicitly, consistent with
# filter_education_content, which reads the same file as UTF-8.
with open("minister_with_neo4j_content.json", "r", encoding="utf-8") as f:
    ministers = json.load(f)

# Count the entries
print(f"Anzahl Minister: {len(ministers)}")

Anzahl Minister: 165


In [1]:
import json

def filter_education_content(input_file='minister_with_neo4j_content.json', output_file='minister_education_filtered.json'):
    """Drop the most frequent non-education sections from the enriched JSON.

    Reads ``input_file`` (a JSON list of person dicts), removes from each
    person's ``neo4j_content`` every section whose ``section_header`` is in
    the filter set below, writes the result to ``output_file`` and prints a
    summary plus the 20 most frequent remaining headers.

    Args:
        input_file: Path of the JSON file to read (UTF-8).
        output_file: Path of the JSON file to write (UTF-8).

    Returns:
        None. The result is written to ``output_file``.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Headers to remove (most frequent ones, no education information).
    # The counts are the occurrences observed when the list was compiled.
    fields_to_filter = {
        '#',  # root element - 165x
        'Weblinks',  # 163x
        'Einzelnachweise',  # 158x
        'Literatur',  # 114x
        'Abgeordneter',  # 46x
        'Abgeordnete',  # 13x
        'Siehe auch'  # 41x
    }

    filtered_data = []
    total_sections_before = 0
    total_sections_after = 0

    for person in data:
        # Shallow copy: only the neo4j_content list is replaced, never edited in place.
        person_copy = person.copy()

        if 'neo4j_content' in person_copy:
            original_content = person_copy['neo4j_content']
            total_sections_before += len(original_content)

            # Keep only sections whose header is not in the filter set.
            filtered_content = [
                content for content in original_content
                if content.get('section_header') not in fields_to_filter
            ]

            person_copy['neo4j_content'] = filtered_content
            total_sections_after += len(filtered_content)

        filtered_data.append(person_copy)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=2)

    removed_sections = total_sections_before - total_sections_after
    # Without any sections there is nothing to reduce; avoid dividing by zero.
    if total_sections_before:
        reduction = removed_sections / total_sections_before * 100
    else:
        reduction = 0.0

    # Print the statistics
    print(f"=== FILTERUNG ERFOLGREICH ===")
    print(f"Gefilterte JSON gespeichert als: {output_file}")
    print(f"Anzahl Personen: {len(data)}")
    # Sorted, because plain set iteration order differs between runs.
    print(f"Gefilterte Felder: {', '.join(sorted(fields_to_filter))}")
    print(f"Section-Headers vor Filterung: {total_sections_before}")
    print(f"Section-Headers nach Filterung: {total_sections_after}")
    print(f"Entfernte Sections: {removed_sections}")
    print(f"Reduktion: {reduction:.1f}%")

    # Count the remaining section headers
    remaining_headers = {}
    for person in filtered_data:
        if 'neo4j_content' in person:
            for content in person['neo4j_content']:
                # A header stored as null is counted as '' so that the
                # width-formatted print below does not fail on None.
                header = content.get('section_header') or ''
                remaining_headers[header] = remaining_headers.get(header, 0) + 1

    print(f"\n=== VERBLEIBENDE SECTION-HEADERS (Top 20) ===")
    sorted_headers = sorted(remaining_headers.items(), key=lambda x: x[1], reverse=True)
    for header, count in sorted_headers[:20]:
        # data cannot be empty here: an empty input leaves no headers to list.
        percentage = (count / len(data)) * 100
        print(f"  {header:<40} | {count:>3}x | {percentage:>5.1f}% der Personen")


# Run the filter with its default input and output file names.
filter_education_content()

=== FILTERUNG ERFOLGREICH ===
Gefilterte JSON gespeichert als: minister_education_filtered.json
Anzahl Personen: 165
Gefilterte Felder: Einzelnachweise, #, Siehe auch, Weblinks, Abgeordnete, Literatur, Abgeordneter
Section-Headers vor Filterung: 1612
Section-Headers nach Filterung: 912
Entfernte Sections: 700
Reduktion: 43.4%

=== VERBLEIBENDE SECTION-HEADERS (Top 20) ===
  Öffentliche Ämter                        |  62x |  37.6% der Personen
  Leben                                    |  58x |  35.2% der Personen
  Ehrungen                                 |  58x |  35.2% der Personen
  Partei                                   |  55x |  33.3% der Personen
  Veröffentlichungen                       |  50x |  30.3% der Personen
  Kabinette                                |  45x |  27.3% der Personen
  Leben und Beruf                          |  42x |  25.5% der Personen
  Auszeichnungen                           |  42x |  25.5% der Personen
  Ausbildung und Beruf                     |  30x