In [73]:
# Text Extraction: Final Version with Doc ID

import pymupdf
import re
import csv
from pathlib import Path
from multi_column import column_boxes


def clean_text(text: str) -> str:
    """
    Normalise a raw text block extracted from a PDF.

    Three regex passes run in a fixed order:
      1. words split by a hyphen at a line break are re-joined,
      2. a leading drop cap sitting on its own line is glued to its word
         (only at the very start of the text),
      3. every run of whitespace collapses to one space.

    Leading/trailing whitespace is removed from the result. Falsy input
    (empty string, None) yields an empty string.
    """
    if not text:
        return ""

    # Order matters: passes 1 and 2 rely on the newlines that pass 3 removes.
    substitutions = (
        (r"(\w+)-\s*\n\s*(\w+)", r"\1\2"),         # Word-\npart -> Wordpart
        (r"^\s*([A-Z])\s*\n\s*([a-z])", r"\1\2"),  # A\ndvances -> Advances
        (r"\s+", " "),                             # collapse whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

def _parse_pdf_metadata(stem: str) -> tuple:
    """Split a ``COUNTRY_YEAR_Document name`` file stem into (country, year, doc_name)."""
    parts = stem.split("_")
    country = parts[0]

    if len(parts) > 1 and parts[1].isdigit():
        year, name_parts = parts[1], parts[2:]
    else:
        # No numeric year token: keep everything after the country as the
        # document name instead of silently dropping the second token.
        year, name_parts = "", parts[1:]

    return country, year, "_".join(name_parts)


def process_folder(folder_path: str, output_csv: str):
    """
    Extract cleaned text blocks from every PDF in a folder into one CSV.

    File names are expected to look like ``COUNTRY_YEAR_Document name.pdf``;
    country, year and document name are taken from the stem. For every page,
    ``column_boxes`` supplies the text rectangles, the text inside each
    rectangle is cleaned with ``clean_text`` and non-empty results are written
    as one CSV row.

    Args:
        folder_path: Directory that contains the PDF files.
        output_csv: Path of the CSV file to (over)write.

    Returns:
        None. Progress and problems are reported with ``print``. A PDF that
        fails to process is reported and skipped; rows already written for it
        stay in the CSV.

    Notes:
        - PDFs are processed in sorted path order so that Doc_ID values are
          reproducible between runs.
        - The CSV is only opened once at least one PDF was found, so an
          existing output file is not truncated by an empty or missing folder.
        - Block_ID restarts at 1 on every page; a block is uniquely identified
          by (Doc_ID, Page, Block_ID).
    """

    pdf_dir = Path(folder_path)

    # Check that the directory exists (a plain file is not a valid input folder)
    if not pdf_dir.is_dir():
        print(f"Directory not found: {folder_path}")
        return

    print(f"Scanning folder: {folder_path}...")

    # Collect the PDFs BEFORE touching the output file; sorted for stable Doc_IDs
    pdf_files = sorted(pdf_dir.glob("*.pdf"))

    if not pdf_files:
        print("No PDF files found in the directory.")
        return

    # Open CSV file once for writing
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)

        writer.writerow(["Doc_ID", "Country", "Year", "Document_Name", "Page", "Block_ID", "text"])

        # Enumerate gives us a counter (doc_id) starting at 0
        for doc_id, pdf_path in enumerate(pdf_files, start=0):

            # --- Metadata Extraction ---
            country, year, doc_name = _parse_pdf_metadata(pdf_path.stem)

            print(f"Processing ID {doc_id}: {pdf_path.name} | Country: {country}, Year: {year}")

            try:
                # --- PDF Text Extraction ---
                # The context manager closes the document even if a page raises.
                with pymupdf.open(pdf_path) as doc:

                    for page_num, page in enumerate(doc, start=1):

                        # Column detection (Bounding Boxes)
                        bboxes = column_boxes(page, footer_margin=50, no_image_text=True)

                        for block_id, rect in enumerate(bboxes, start=1):

                            # Extract text from the specific box
                            raw_text = page.get_text(clip=rect, sort=True)

                            # Clean text
                            final_text = clean_text(raw_text)

                            # Write to CSV if text exists
                            if final_text:
                                writer.writerow([doc_id, country, year, doc_name, page_num, block_id, final_text])

            except Exception as e:
                # Deliberate best effort: report the broken PDF and continue with the next one.
                print(f"Failed to process {pdf_path.name}: {e}")


    print(f"Extraction complete. All data saved in '{output_csv}'.")


# --- Main Execution ---
# Entry point of the extraction cell: runs the whole folder -> CSV pipeline once.
# Inside a notebook kernel __name__ is "__main__", so this guard always passes
# there; it only prevents execution if the code is imported as a module.
if __name__ == "__main__":

    # Define your folder and output filename here
    # (both names stay defined at module level after the cell has run)
    input_folder = "countries_edited"
    output_filename = "text_countries_edited.csv"

    process_folder(input_folder, output_filename)

Scanning folder: countries_edited...
Processing ID 0: CAN_2022_Canada's National Quantum Strategy.pdf | Country: CAN, Year: 2022
Processing ID 1: GER_2023_Quantum Technologies Conceptual Framework Programme.pdf | Country: GER, Year: 2023
Processing ID 2: UK_2023_National Quantum Strategy.pdf | Country: UK, Year: 2023
Processing ID 3: USA_2018_NATIONAL STRATEGIC  OVERVIEW FOR QUANTUM  INFORMATION SCIENCE.pdf | Country: USA, Year: 2018
Extraction complete. All data saved in 'text_countries_edited.csv'.


In [74]:
# Load csv to df
import pandas as pd
# Re-declare the CSV path so this cell also works when the extraction cell was not run.
output_filename = "text_countries_edited.csv"
# One row per extracted text block (columns as written by process_folder).
df = pd.read_csv(output_filename)
# Quick sanity check of the first rows.
print(df.head())

   Doc_ID Country  Year                       Document_Name  Page  Block_ID  \
0       0     CAN  2022  Canada's National Quantum Strategy     2         1   
1       0     CAN  2022  Canada's National Quantum Strategy     3         1   
2       0     CAN  2022  Canada's National Quantum Strategy     3         2   
3       0     CAN  2022  Canada's National Quantum Strategy     4         1   
4       0     CAN  2022  Canada's National Quantum Strategy     4         2   

                                                text  
0  Executive summary dvances in quantum science h...  
1  Three key missions The National Quantum Strate...  
2  Next steps To strengthen Canada’s quantum ecos...  
3                          Canada: A quantum pioneer  
4  ince the birth of quantum science more than 10...  


In [None]:
import pandas as pd
import networkx as nx
from itertools import combinations

# 1. LOAD DATA
# Load the file and force columns to be strings to avoid the "bool" error
# (modified_entity_name holds True/False); empty cells become "".
df = pd.read_csv("entities_to_edit.csv", dtype=str).fillna("")

# --- CONFIGURATION ---
# Co-occurrence scope = one text block (paragraph).
# Block_ID is numbered from 1 again on every page during extraction, so
# Doc_ID + Block_ID alone would merge unrelated blocks from different pages.
# A paragraph is only identified by the (Doc_ID, Page, Block_ID) triple.
scope = ['Doc_ID', 'Page', 'Block_ID']
# ---------------------

print("--- Step 1: Pre-processing & Cleaning ---")

# A. CLEANING FUNCTION
def clean_name(name):
    """
    Return the whitespace-trimmed entity name, or None when it is unusable.

    The value is converted with ``str`` first. Empty strings and the
    case-insensitive placeholders "true", "false" and "nan" (accidental
    booleans / missing values) are rejected by returning None.
    """
    candidate = str(name).strip()
    is_placeholder = candidate.lower() in ("", "true", "false", "nan")
    return None if is_placeholder else candidate

# Apply cleaning to the FINALIZED column
# (adds a 'clean_name' column to df in place; unusable names become None)
df['clean_name'] = df['finalized_entity_name'].apply(clean_name)

# Remove rows where we don't have a valid name
# (dropna removes exactly the rows for which clean_name returned None)
df_clean = df.dropna(subset=['clean_name'])

# Report how many rows were discarded by the cleaning step.
print(f"Original rows: {len(df)} -> Cleaned rows: {len(df_clean)}")

# =========================================================
# LOGIC 1: DETECTING RELATIONSHIPS (Building the Graph)
# =========================================================
print("\n--- Step 2: Building the Network Graph ---")

# Group by the configured co-occurrence scope: one array of distinct actor
# names per group.
grouped = df_clean.groupby(scope)['clean_name'].unique()

G = nx.Graph()

# Iterate through groups to find links
for actors in grouped:
    if len(actors) < 2:
        continue

    # Create links between every pair in this block; the edge weight counts
    # in how many groups the pair co-occurs.
    for u, v in combinations(sorted(actors), 2):
        if G.has_edge(u, v):
            G[u][v]['weight'] += 1
        else:
            G.add_edge(u, v, weight=1)

# Build a lookup dictionary for Categories (Name -> QH Category).
# Rows with an empty category are removed first so that .first() really picks
# the first NON-EMPTY category of each actor.
has_category = df_clean['qh_category'].astype(str).str.strip() != ""
actor_cats = df_clean[has_category].groupby('clean_name')['qh_category'].first()

# Must run AFTER the edges were added: set_node_attributes silently ignores
# names that are not nodes of the graph, so calling it on the empty graph
# stored nothing and every actor was later reported as "Unknown".
nx.set_node_attributes(G, actor_cats.to_dict(), "category")

print(f"Graph Created: {G.number_of_nodes()} Actors, {G.number_of_edges()} Links.")

# =========================================================
# LOGIC 2: CENTRALITY (Who are the Hubs?)
# =========================================================
print("\n--- Step 3: Calculating Centrality (Hubs) ---")

if G.number_of_nodes() == 0:
    print("No connections found. Check if 'finalized_entity_name' is populated.")
else:
    # Degree centrality: share of the other nodes each actor is linked to.
    degree_dict = nx.degree_centrality(G)

    # One record per actor: category (falls back to "Unknown"), raw number of
    # partners and the normalised centrality score.
    node_data = [
        {
            "Actor_Name": node,
            "QH_Category": attrs.get("category", "Unknown"),
            "Connections_Count": G.degree(node),
            "Centrality_Score": degree_dict[node],
        }
        for node, attrs in G.nodes(data=True)
    ]

    # Best connected actors first.
    nodes_df = pd.DataFrame(node_data).sort_values("Connections_Count", ascending=False)
    nodes_df.to_csv("network_nodes_centrality.csv", index=False)
    print("Saved Hub Analysis to 'network_nodes_centrality.csv'")
    print(nodes_df.head(5))

# =========================================================
# LOGIC 3: TOPIC MAPPING (Sector vs Sector)
# =========================================================
print("\n--- Step 4: Topic Mapping (Category Matrix) ---")

if G.number_of_edges() == 0:
    print("No edges to map.")
else:
    # Sum the edge weights per unordered category pair.
    cat_interaction_counts = {}
    for u, v, data in G.edges(data=True):
        # Sorting makes (Gov, Industry) and (Industry, Gov) the same key.
        cats = tuple(sorted(str(G.nodes[n].get("category", "Unknown")) for n in (u, v)))
        cat_interaction_counts[cats] = cat_interaction_counts.get(cats, 0) + data['weight']

    # Long format: one row per category pair.
    matrix_data = [
        {"Category_1": first, "Category_2": second, "Weight": total}
        for (first, second), total in cat_interaction_counts.items()
    ]
    matrix_df = pd.DataFrame(matrix_data)

    # Add the swapped pairs so the pivot yields a symmetric square matrix;
    # drop_duplicates removes the doubled same-category rows.
    matrix_mirrored = matrix_df.rename(columns={"Category_1": "Category_2", "Category_2": "Category_1"})
    full_matrix = pd.concat([matrix_df, matrix_mirrored]).drop_duplicates()

    # Wide format: categories x categories, missing combinations become 0.
    final_matrix = (
        full_matrix
        .pivot(index="Category_1", columns="Category_2", values="Weight")
        .fillna(0)
        .astype(int)
    )

    final_matrix.to_csv("network_topic_matrix.csv")
    print("Saved Topic Map to 'network_topic_matrix.csv'")
    print(final_matrix)

2026-01-09 14:57:46,112 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


Batch inference: 100%|██████████| 9/9 [03:22<00:00, 22.47s/it] 


Saved 739 entities.
   Doc_ID  Entity_ID Country  Year                       Document_Name  Page  \
0       0          0     CAN  2022  Canada's National Quantum Strategy     2   
1       0          1     CAN  2022  Canada's National Quantum Strategy     3   
2       0          2     CAN  2022  Canada's National Quantum Strategy     3   
3       0          3     CAN  2022  Canada's National Quantum Strategy     3   
4       0          4     CAN  2022  Canada's National Quantum Strategy     4   

   Block_ID                          entity_name ner_label qh_category  \
0         1                             Acountry       ORG               
1         1        The National Quantum Strategy       ORG               
2         1                                  NQS       ORG               
3         1                 Government of Canada       ORG               
4         3  National Research Council of Canada       ORG               

  qh_sub_category qh_exact_category modified_entity_na

In [None]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output

# --- 1. CONFIGURATION & DATA LOADING ---
# CSV that is read when the tool starts and rewritten on every "Save & Next".
filename = "entities_to_edit.csv"

# --- TAXONOMY (General English) ---
# Three-level lookup used by the labelling widgets:
#   helix (main category) -> sub-category -> {"desc": ..., "keywords": [...]}
# "desc" is shown as the info line once a sub-category is selected;
# "keywords" are offered as choices in the "Exact" multi-select.
# The dict is extended at runtime when the user creates new categories.
# Some keywords intentionally or accidentally occur under several
# sub-categories (e.g. "Fund", "Venture", "Group", "Initiative", "Department").
TAXONOMY = {
    "Academia": {
        "Higher Education Institutions (HEIs)": {
            "desc": "Universities, Colleges, Schools (Teaching & Research).",
            "keywords": [
                "University", "College", "School", "Academy", "Faculty", "Department", 
                "Chair", "Campus", "Institute of Technology", "Polytechnic", 
                "Business School", "Medical School", "Law School"
            ]
        },
        "Public Research Orgs (PROs)": {
            "desc": "Research Institutes (Knowledge Output, no teaching).",
            "keywords": [
                "Institute", "Center", "Centre", "Laboratory", "Lab", "Observatory", 
                "National Lab", "Research Council", "Think Tank", "Agency (Research)"
            ]
        }
    },
    "Industry": {
        "Private Firms (Corporates)": {
            "desc": "Established Companies, SMEs, MNEs.",
            "keywords": [
                "Inc", "Corp", "Corporation", "Ltd", "LLC", "PLC", "Co", "Company", 
                "Group", "Holdings", "Manufacturer", "Supplier", "Vendor", 
                "Conglomerate", "Multinational", "Enterprise", "Firm"
            ]
        },
        "Start-ups": {
            "desc": "Young Growth Companies, Spin-offs.",
            "keywords": [
                "Start-up", "Startup", "Spin-off", "Spinoff", "Scale-up", "Unicorn", 
                "Venture", "NewCo", "DeepTech", "Founder", "Stealth Mode"
            ]
        },
        "Consulting": {
            "desc": "Services, Advisory, Legal, HR.",
            "keywords": [
                "Consulting", "Consultancy", "Advisors", "Partners", "Legal", "Law Firm", 
                "LLP", "Attorney", "IP Law", "Patent", "Audit", "Tax", "Recruitment", 
                "Headhunter", "Strategy", "Management", "Services"
            ]
        },
        "Venture Capital / Investors": {
            "desc": "Financial Actors, VCs, Business Angels.",
            "keywords": [
                "Capital", "Invest", "Investment", "Fund", "Venture", "VC", "Equity", 
                "Private Equity", "PE", "Angel", "Seed", "Asset Management", "Bank", 
                "Financial Group", "Holding", "Wealth Management"
            ]
        }
    },
    "Government": {
        "Policy Makers": {
            "desc": "Ministries, Councils, Parliaments (Regulation).",
            "keywords": [
                "Ministry", "Department", "Dept", "Council", "Government", "Federal", 
                "State", "Municipality", "City", "County", "District", "Parliament", 
                "Senate", "Commission", "Mayor", "Governor", "Regulator", "Authority", 
                "Administration", "Bureau"
            ]
        },
        "Funding Agencies": {
            "desc": "Funding Bodies, Project Management Agencies.",
            "keywords": [
                "Foundation", "Agency", "Grant", "Funding", "Fund", "Endowment", 
                "Trust", "Award", "Scholarship", "Fellowship", "Program", "Initiative"
            ]
        }
    },
    "Civil Society": {
        "Media": {
            "desc": "Press, News, Journals.",
            "keywords": [
                "News", "Journal", "Press", "Times", "Post", "Daily", "Review", 
                "Magazine", "Publisher", "Broadcaster", "TV", "Radio", "Podcast", 
                "Blog", "Media", "Outlet", "Chronicle", "Gazette"
            ]
        },
        "Cultural Institutions": {
            "desc": "Museums, Libraries, Galleries.",
            "keywords": [
                "Museum", "Library", "Gallery", "Theater", "Opera", "Orchestra", 
                "Archive", "Collection", "Exhibition", "Zoo", "Botanical Garden", 
                "Science Center", "Planetarium", "Hall"
            ]
        },
        "NGOs / NPOs": {
            "desc": "Non-Profit, Social Goals, Charities.",
            "keywords": [
                "Charity", "Non-Profit", "NPO", "NGO", "Organization", "Society", 
                "Club", "Union", "Alliance", "Federation", "Initiative", "Philanthropy", 
                "Foundation (Private)", "Mission", "Relief"
            ]
        },
        "Intermediaries": {
            "desc": "Clusters, Hubs, TTOs, Chambers.",
            "keywords": [
                "Cluster", "Network", "Hub", "Incubator", "Accelerator", "TTO", 
                "Technology Transfer", "Chamber of Commerce", "Trade Union", 
                "Association", "Consortium", "Standardization", "Body", "Council (Trade)"
            ]
        },
        "Citizens / Users": {
            "desc": "Citizens, Patients, User Groups.",
            "keywords": [
                "Community", "Group", "Public", "Citizen", "Patient", "User", 
                "Resident", "Population", "Crowd", "Forum", "Volunteer", "Advocacy"
            ]
        }
    }
}

# Load the entity table; missing cells become "" so that "not yet tagged"
# can be tested with == "". A missing file yields an empty frame, which the
# widget section below reports instead of building the UI.
try:
    df = pd.read_csv(filename).fillna("")
except FileNotFoundError:
    print(f"Error: '{filename}' not found.")
    df = pd.DataFrame()
else:
    # Make sure every column the tool writes to exists before the first save.
    required_cols = ["qh_category", "qh_sub_category", "qh_exact_category", "modified_entity_name", "finalized_entity_name"]
    for col in required_cols:
        if col not in df.columns:
            df[col] = ""

# --- 2. WIDGET SETUP ---

if not df.empty:
    # Rows without a main category count as "not yet tagged"; start at the
    # first of them, or at row 0 when everything is already tagged.
    unfinished_indices = df[df['qh_category'] == ""].index.tolist()
    current_idx = unfinished_indices[0] if unfinished_indices else 0
    # Sentinel dropdown entry that reveals the matching free-text field.
    NEW_OPT = "+++ Create New +++"

    # Progress Bar
    total_items = len(df)
    progress = widgets.IntProgress(
        value=len(df) - len(unfinished_indices),
        min=0,
        max=total_items,
        description='Progress:',
        bar_style='success',
        layout=widgets.Layout(width='99%')
    )
    progress_label = widgets.Label(value=f"{progress.value} / {total_items} tagged")

    # --- INPUT WIDGETS ---

    # 0. Entity Name Editor
    w_name_edit = widgets.Text(
        description='<b>Edit Name:</b>',
        placeholder='Correct the entity name here...',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='600px')
    )

    # 1. Main Category
    w_helix = widgets.Dropdown(
        options=[''] + list(TAXONOMY.keys()) + [NEW_OPT],
        description='1. Helix:',
        layout=widgets.Layout(width='400px')
    )
    # Hidden until NEW_OPT is chosen in w_helix.
    w_helix_new = widgets.Text(
        placeholder='Type new Helix category...',
        layout=widgets.Layout(width='400px', display='none') 
    )

    # 2. Sub Category
    # Starts empty and disabled; options are filled from the chosen helix.
    w_sub = widgets.Dropdown(
        options=[],
        description='2. Type:',
        layout=widgets.Layout(width='400px'),
        disabled=True
    )
    # Hidden until NEW_OPT is chosen in w_sub.
    w_sub_new = widgets.Text(
        placeholder='Type new Sub-Category...',
        layout=widgets.Layout(width='400px', display='none')
    )

    # 3. Exact Category (MULTI SELECT)
    # The selected items are stored as one "; "-joined string per row.
    w_exact_multi = widgets.SelectMultiple(
        options=[],
        description='3. Exact:',
        rows=8, # Height of the box
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px')
    )
    
    # Helper text for multi-select
    w_multi_help = widgets.HTML(
        value="<div style='font-size:0.8em; color:#666; margin-left:100px;'><i>Hold <b>Ctrl</b> (Win) or <b>Cmd</b> (Mac) to select multiple.</i></div>"
    )

    # Free-text field for a keyword that is not in the list yet.
    w_exact_new = widgets.Text(
        placeholder='Type NEW keyword here...',
        layout=widgets.Layout(width='400px')
    )
    # Button to add the typed keyword to the list immediately
    btn_add_exact = widgets.Button(
        description='Add to List',
        icon='plus',
        layout=widgets.Layout(width='100px')
    )

    # Info & Display
    w_info = widgets.HTML(value="<div style='color:#666; font-style:italic; margin-left:100px;'>Select a category...</div>")
    # Header card with the entity currently being tagged.
    w_entity_display = widgets.HTML()
    # Receives the status messages printed after a save.
    output_log = widgets.Output()

    # Buttons
    btn_save = widgets.Button(description='Save & Next ‚û°Ô∏è', button_style='primary')
    btn_prev = widgets.Button(description='‚¨ÖÔ∏è Previous')

    # --- 3. LOGIC ---

    def get_split_history():
        """
        Collect every exact-category keyword already stored in the table.

        Each 'qh_exact_category' cell may hold several keywords separated by
        ';'. Cells that are missing, blank or the text "nan" are skipped, the
        remaining cells are split, every piece is trimmed, and the distinct
        non-empty pieces are returned as a sorted list.
        """
        keywords = {
            piece.strip()
            for cell in df['qh_exact_category'].dropna().unique()
            if str(cell) != "nan" and str(cell).strip() != ""
            for piece in str(cell).split(';')
            if piece.strip()
        }
        return sorted(keywords)

    def update_display():
        """
        Refresh every widget so that it shows the row at ``current_idx``.

        Order of operations (later steps depend on earlier ones because
        assigning dropdown options/values fires the change handlers):
          1. render the entity header card (CSV text is HTML-escaped),
          2. pre-fill the name editor with the finalized name, falling back to
             the original entity name,
          3. restore helix, then sub-category, then the exact multi-selection,
             adding stored values that are not among the options yet,
          4. hide and clear the "create new" text fields,
          5. update the progress bar and label.

        If ``current_idx`` is past the last row only an "All Done" banner is
        shown.
        """
        import html  # local import: only needed to escape CSV text for the HTML widget

        if current_idx >= len(df):
            w_entity_display.value = "<div style='background:#d4edda; color:#155724; padding:15px;'><h3>üéâ All Done!</h3></div>"
            return

        row = df.loc[current_idx]

        # Entity and document names come from PDF text and may contain
        # characters such as '<' or '&' that would otherwise be interpreted
        # as markup by the HTML widget.
        entity_html = html.escape(str(row['entity_name']))
        doc_html = html.escape(str(row.get('Document_Name', 'N/A')))
        year_html = html.escape(str(row.get('Year', 'N/A')))

        # Display Context
        w_entity_display.value = f"""
        <div style="background-color: #f8f9fa; border-left: 5px solid #0d6efd; padding: 15px; margin-bottom: 10px;">
            <div style="margin-bottom: 5px; font-size: 0.9em; color: #495057;">
                <b>Original Entity:</b> <span style="font-family: monospace; font-size: 1.1em;">{entity_html}</span>
            </div>
            <div style="font-size: 0.85em; color: #666;">
                Doc: {doc_html} | Year: {year_html}
            </div>
        </div>
        """

        # Name Edit: prefer a previously saved finalized name
        existing_final = str(row['finalized_entity_name'])
        if existing_final and existing_final.strip() != "" and existing_final != "nan":
            w_name_edit.value = existing_final
        else:
            w_name_edit.value = str(row['entity_name'])

        # Load Categories
        current_cat = row['qh_category']
        current_sub = row['qh_sub_category']
        current_exact = str(row['qh_exact_category'])

        # 1. Set Helix (a stored category unknown to the dropdown is inserted
        #    just before the "create new" entry)
        if current_cat in w_helix.options:
            w_helix.value = current_cat
        elif current_cat:
            w_helix.options = list(w_helix.options)[:-1] + [current_cat, NEW_OPT]
            w_helix.value = current_cat
        else:
            w_helix.value = ''

        # 2. Set Sub (Trigger updates)
        update_sub_options(w_helix.value)
        if current_sub in w_sub.options:
            w_sub.value = current_sub
        elif current_sub:
            w_sub.options = list(w_sub.options)[:-1] + [current_sub, NEW_OPT]
            w_sub.value = current_sub

        # 3. Set Exact (Trigger updates based on Sub)
        update_exact_options(w_helix.value, w_sub.value)

        # Handle Pre-selection of Multiple Items
        if current_exact and current_exact != "nan" and current_exact.strip():
            # Split by semicolon
            selected_items = [x.strip() for x in current_exact.split(';')]
            # Ensure they exist in options
            current_options = list(w_exact_multi.options)
            for item in selected_items:
                if item not in current_options and item != "":
                    current_options.append(item)

            w_exact_multi.options = sorted(current_options)

            # Set Value (must be a tuple of matching strings)
            valid_selection = [x for x in selected_items if x in w_exact_multi.options]
            w_exact_multi.value = tuple(valid_selection)
        else:
            w_exact_multi.value = ()

        # Reset New Fields: hide them AND drop text typed for a previous row,
        # otherwise a stale name would be saved if "create new" is chosen again.
        w_helix_new.layout.display = 'none'
        w_sub_new.layout.display = 'none'
        w_helix_new.value = ''
        w_sub_new.value = ''
        w_exact_new.value = '' # Clear new keyword box

        # Progress
        done_count = len(df[df['qh_category'] != ""])
        progress.value = done_count
        progress_label.value = f"{done_count} / {total_items} tagged"

    def update_sub_options(main_cat):
        """
        Fill the sub-category dropdown for the given main category.

        - known taxonomy category: blank entry, its sub-categories in sorted
          order, then the "create new" entry;
        - any other non-empty category (except the "create new" sentinel):
          only the blank and "create new" entries;
        - otherwise: no options, and the dropdown is disabled.
        """
        if main_cat in TAXONOMY:
            choices = ['', *sorted(TAXONOMY[main_cat]), NEW_OPT]
        elif main_cat and main_cat != NEW_OPT:
            choices = ['', NEW_OPT]
        else:
            choices = []

        w_sub.options = choices
        # Only an empty option list leaves the dropdown disabled.
        w_sub.disabled = not choices

    def update_exact_options(main, sub):
        """
        Rebuild the choices of the "Exact" multi-select.

        The choices are the taxonomy keywords of the given main/sub category
        (none if the pair is not in the taxonomy) merged with every keyword
        already stored in the table, de-duplicated and sorted.
        """
        taxonomy_keywords = []
        if main in TAXONOMY and sub in TAXONOMY[main]:
            taxonomy_keywords = TAXONOMY[main][sub].get('keywords', [])

        merged = set(taxonomy_keywords)
        merged.update(get_split_history())

        w_exact_multi.options = sorted(merged)

    # --- EVENT HANDLERS ---

    def on_helix_change(change):
        """
        React to a new value of the main-category dropdown.

        Choosing the "create new" entry reveals the free-text field and offers
        only blank / "create new" as sub-category. Any other value hides the
        field and rebuilds the sub-category and exact options. The info line
        is cleared in both cases. Notifications for other traits are ignored.
        """
        if change['type'] != 'change' or change['name'] != 'value':
            return

        selected = change['new']
        creating_new = selected == NEW_OPT

        w_helix_new.layout.display = 'block' if creating_new else 'none'
        if creating_new:
            w_sub.options = [''] + [NEW_OPT]
            w_sub.disabled = False
        else:
            update_sub_options(selected)
            update_exact_options(selected, '')
        w_info.value = ""

    def on_sub_change(change):
        """
        React to a new value of the sub-category dropdown.

        Choosing the "create new" entry reveals the free-text field. Any other
        value hides it and rebuilds the exact-keyword options. The info line
        shows the taxonomy description of the selected sub-category and is
        cleared when there is none, so that the description of a previously
        selected type does not linger. Notifications for other traits are
        ignored.
        """
        if change['type'] != 'change' or change['name'] != 'value':
            return

        val = change['new']
        main = w_helix.value

        if val == NEW_OPT:
            w_sub_new.layout.display = 'block'
            w_info.value = ""
            return

        w_sub_new.layout.display = 'none'
        update_exact_options(main, val)

        if main in TAXONOMY and val in TAXONOMY[main]:
            desc = TAXONOMY[main][val]['desc']
            w_info.value = f"<div style='color:#0d6efd; margin-left:100px;'>‚ÑπÔ∏è {desc}</div>"
        else:
            # Blank or unknown sub-category: remove the stale description.
            w_info.value = ""

    def add_new_keyword(b):
        """
        Add the typed keyword to the "Exact" list and select it.

        The keyword is trimmed; blank input is ignored. Items that were
        already selected stay selected. The input box is cleared afterwards.

        Args:
            b: The clicked button (unused; required by ``on_click``).
        """
        new_val = w_exact_new.value.strip()
        if not new_val:
            return

        # Snapshot the selection FIRST: assigning new options to a
        # SelectMultiple resets its selection, which previously discarded
        # everything the user had already picked.
        current_sel = list(w_exact_multi.value)

        # Add to options
        current_opts = list(w_exact_multi.options)
        if new_val not in current_opts:
            current_opts.append(new_val)
            w_exact_multi.options = sorted(current_opts)

        # Restore the previous selection plus the new keyword
        if new_val not in current_sel:
            current_sel.append(new_val)
        w_exact_multi.value = tuple(current_sel)

        w_exact_new.value = "" # Clear input

    def save_and_next(b):
        """
        Store the current widget state in the table, write the CSV, advance.

        Steps:
          1. Save the (possibly corrected) entity name and whether it differs
             from the original.
          2. Read main / sub / exact category from the widgets. All widget
             values are captured before any widget is modified, because
             changing dropdown options fires the change handlers and resets
             the dropdown values.
          3. Register newly typed categories in TAXONOMY and the dropdowns.
          4. Write the row, persist the whole table to ``filename`` and log.
          5. Show the next row, or report that the end was reached.

        Args:
            b: The clicked button (unused; required by ``on_click``).
        """
        global current_idx

        # 1. Name Logic
        original_name = str(df.at[current_idx, 'entity_name']).strip()
        new_name_input = str(w_name_edit.value).strip()
        name_was_edited = new_name_input != original_name

        df.at[current_idx, 'modified_entity_name'] = name_was_edited
        # Equals the original name when nothing was edited.
        df.at[current_idx, 'finalized_entity_name'] = new_name_input
        name_log = "Edited name" if name_was_edited else "Name original"

        # 2. Category Logic: snapshot everything before touching any widget
        creating_cat = w_helix.value == NEW_OPT
        creating_sub = w_sub.value == NEW_OPT

        # Typed names are trimmed; a dropdown without options reports None,
        # which is stored as "" like every other empty cell.
        val_cat = (w_helix_new.value.strip() if creating_cat else w_helix.value) or ""
        val_sub = (w_sub_new.value.strip() if creating_sub else w_sub.value) or ""

        # Process Exact: Join tuple values with semicolon
        val_exact = "; ".join(w_exact_multi.value)

        # Learn Main
        if creating_cat and val_cat:
            if val_cat not in TAXONOMY:
                TAXONOMY[val_cat] = {}
                opts = list(w_helix.options)
                opts.insert(-1, val_cat)  # keep the "create new" entry last
                w_helix.options = opts

        # Learn Sub (only below a named main category, so no "" helix is created)
        if creating_sub and val_sub and val_cat:
            sub_map = TAXONOMY.setdefault(val_cat, {})
            if val_sub not in sub_map:
                sub_map[val_sub] = {'desc': 'User Added', 'keywords': []}
            opts = list(w_sub.options)
            if val_sub not in opts:
                opts.insert(-1, val_sub)
                w_sub.options = opts

        # Exact History Learning is handled automatically next time get_split_history is called on the DF

        # Save to DF
        df.at[current_idx, 'qh_category'] = val_cat
        df.at[current_idx, 'qh_sub_category'] = val_sub
        df.at[current_idx, 'qh_exact_category'] = val_exact

        df.to_csv(filename, index=False)

        with output_log:
            clear_output(wait=True)
            print(f"‚úÖ Saved. ({name_log})")

        # Next
        if current_idx < len(df) - 1:
            current_idx += 1
            update_display()
        else:
            with output_log:
                print("End of list reached.")

    def go_prev(b):
        """
        Step back to the previous row and refresh the widgets.

        Nothing happens on the first row. Unsaved input of the current row is
        not written anywhere.

        Args:
            b: The clicked button (unused; required by ``on_click``).
        """
        global current_idx
        if current_idx <= 0:
            return
        current_idx -= 1
        update_display()
    # --- 4. LAYOUT ---
    # Wire the handlers. observe() is called without a trait filter, so the
    # handlers themselves ignore everything except 'value' changes.
    w_helix.observe(on_helix_change)
    w_sub.observe(on_sub_change)
    btn_add_exact.on_click(add_new_keyword)
    
    btn_save.on_click(save_and_next)
    btn_prev.on_click(go_prev)

    # Populate all widgets for the starting row before they are displayed.
    update_display()

    # Vertical layout, top to bottom: progress, entity card, name editor,
    # category dropdowns with their hidden "create new" fields, info line,
    # exact-keyword section, navigation buttons, status log.
    ui = widgets.VBox([
        widgets.HBox([progress, progress_label]),
        w_entity_display,
        
        w_name_edit,
        widgets.HTML("<div style='height:10px;'></div>"),

        w_helix, w_helix_new,
        w_sub, w_sub_new,
        w_info,
        
        widgets.HTML("<hr style='margin: 5px 0; border:0; border-top:1px solid #eee;'>"),
        
        # Exact Multi Select Section
        widgets.VBox([
            w_exact_multi,
            w_multi_help,
            widgets.HBox([w_exact_new, btn_add_exact])
        ]),
        
        widgets.HTML("<hr style='margin: 10px 0;'>"),
        widgets.HBox([btn_prev, btn_save]),
        output_log
    ])
    
    display(ui)

else:
    # Reached when the CSV was not found or contains no rows.
    print("DataFrame is empty.")

VBox(children=(HBox(children=(IntProgress(value=122, bar_style='success', description='Progress:', layout=Layo…

In [None]:
# Re-run deduplication and save
# TODO: Re-run deduplication and save, since manually modified entity names may have created duplicates.
# TODO: Entities should also be merged with their abbreviations; otherwise entities that have an abbreviation produce duplicate entries.
