In [97]:
# Text Extraction: Final Version with Doc ID

import pymupdf
import re
import csv
from pathlib import Path
from multi_column import column_boxes


def clean_text(text: str) -> str:
    """
    Bereinigt Text: Fixiert Silbentrennung, Initialen und Whitespace.
    """

    if not text:
        return ""

    # 1. Fix Hyphenation (Word-\npart -> Wordpart)
    text = re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", text)

    # 3. Collapse whitespace
    return re.sub(r"\s+", " ", text).strip()

def process_folder(folder_path: str, output_csv: str):
    """
    Iterates through all PDFs in folder_path, extracts metadata from filenames,
    extracts/cleans text from pages, and writes everything to a single CSV.
    """

    pdf_dir = Path(folder_path)
    
    # Check if directory exists
    if not pdf_dir.exists():
        print(f"Directory not found: {folder_path}")
        return

    print(f"Scanning folder: {folder_path}...")

    # Open CSV file once for writing
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)

        # Update Header: Added "Doc_ID" as the first column
        writer.writerow(["Doc_ID", "Country", "Year", "Document_Name", "Page", "Block_ID", "text"])

        # Iterate over all PDF files in the directory
        pdf_files = list(pdf_dir.glob("*.pdf"))

        if not pdf_files:
            print("No PDF files found in the directory.")
            return

        # Enumerate gives us a counter (doc_id) starting at 0
        for doc_id, pdf_path in enumerate(pdf_files, start=0):

            # --- Metadata Extraction ---
            parts = pdf_path.stem.split("_")
            country = parts[0]

            # Check if second part is a year (digits)
            year = parts[1] if len(parts) > 1 and parts[1].isdigit() else ""

            # Join the rest as the document name
            doc_name = "_".join(parts[2:]) if len(parts) > 2 else ""

            print(f"Processing ID {doc_id}: {pdf_path.name} | Country: {country}, Year: {year}")

            try:

                # --- PDF Text Extraction ---
                doc = pymupdf.open(pdf_path)

                for page_num, page in enumerate(doc, start=1):

                    # Column detection (Bounding Boxes)
                    bboxes = column_boxes(page, footer_margin=50, no_image_text=True)

                    for block_id, rect in enumerate(bboxes, start=1):

                        # Extract text from the specific box
                        raw_text = page.get_text(clip=rect, sort=True)

                        # Clean text
                        final_text = clean_text(raw_text)

                        # Write to CSV if text exists
                        if final_text:
                            # Added doc_id to the row data
                            writer.writerow([doc_id, country, year, doc_name, page_num, block_id, final_text])

                doc.close()

            except Exception as e:
                print(f"Failed to process {pdf_path.name}: {e}")


    print(f"Extraction complete. All data saved in '{output_csv}'.")


# --- Main Execution ---
if __name__ == "__main__":

    # Define your folder and output filename here
    input_folder = "countries_edited"
    output_filename = "text_countries_edited.csv"

    process_folder(input_folder, output_filename)

Scanning folder: countries_edited...


PermissionError: [Errno 13] Permission denied: 'text_countries_edited.csv'

In [74]:
# Load csv to df
import pandas as pd
output_filename = "text_countries_edited.csv"
df = pd.read_csv(output_filename)
print(df.head())

   Doc_ID Country  Year                       Document_Name  Page  Block_ID  \
0       0     CAN  2022  Canada's National Quantum Strategy     2         1   
1       0     CAN  2022  Canada's National Quantum Strategy     3         1   
2       0     CAN  2022  Canada's National Quantum Strategy     3         2   
3       0     CAN  2022  Canada's National Quantum Strategy     4         1   
4       0     CAN  2022  Canada's National Quantum Strategy     4         2   

                                                text  
0  Executive summary dvances in quantum science h...  
1  Three key missions The National Quantum Strate...  
2  Next steps To strengthen Canada‚Äôs quantum ecos...  
3                          Canada: A quantum pioneer  
4  ince the birth of quantum science more than 10...  


In [None]:
import pandas as pd
import flair
from flair.data import Sentence
from flair.models import SequenceTagger
from segtok.segmenter import split_single
import torch

# --- 1. SETUP & CONFIGURATION ---

# Auto-detect GPU for massive speedup
flair.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {flair.device}")

# Load the FAST model (20x faster, ~98% relative accuracy of large model)
tagger = SequenceTagger.load('ner-fast')

# --- 2. LOAD & PREPARE DATA ---

# Ensure text is string and handle missing values
# Note: Ensure 'df' is defined before this step (e.g., df = pd.read_csv(...))
df["text"] = df["text"].fillna("").astype(str)
print("Preparing data for batch processing...")

# We need to flatten the data: Documents -> Sentences.
# We use 'row_mapping' to remember which original row/block a sentence came from.

all_sentences = []
row_mapping = []

for idx, row in df.iterrows():
    text = row['text']

    # Skip extremely short garbage text
    if len(text) < 2:
        continue

    # Split text into sentences (essential for Flair accuracy)
    sentences_raw = split_single(text)

    for sent_raw in sentences_raw:
        if sent_raw.strip():
            sent_obj = Sentence(sent_raw)
            all_sentences.append(sent_obj)
            row_mapping.append(idx)  # Store the index of the original row

print(f"Total sentences to process: {len(all_sentences)}")

# --- 3. BATCH PREDICTION (High Speed) ---

# Batch size: 32 for CPU, 128+ for GPU
BATCH_SIZE = 256 if torch.cuda.is_available() else 32

print("Starting batched prediction...")
tagger.predict(all_sentences, mini_batch_size=BATCH_SIZE, verbose=True)

# --- 4. EXTRACT & MERGE METADATA ---

print("Extracting entities and merging metadata...")

# Set used for deduplication per original block
# Stores: (original_index, entity_text, entity_label)
unique_entities = set()

# Iterate through predictions and map them back to original row index
for idx, sent_obj in zip(row_mapping, all_sentences):
    for entity in sent_obj.get_spans('ner'):
        # Filter for only relevant tags
        if entity.tag in ['ORG', 'PER']:
            # Add to set (deduplicates if same entity appears twice in same block)
            unique_entities.add((idx, entity.text, entity.tag))

# --- 5. BUILD FINAL DATAFRAME ---

final_rows = []

for idx, entity_name, ner_label in unique_entities:
    # Retrieve the original row metadata using the index
    original_row = df.loc[idx]

    final_rows.append({
        "Country": original_row.get("Country", ""),
        "Year": original_row.get("Year", ""),
        "Document_Name": original_row.get("Document_Name", ""),
        "Page": original_row.get("Page", ""),
        "Block_ID": original_row.get("Block_ID", ""),
        "entity_name": entity_name,
        "ner_label": ner_label,
        "qh_category": "",        # Empty for manual input
        "qh_sub_category": "",    # Empty for manual input
        "qh_exact_category": ""   # Empty for manual input
    })

output_df = pd.DataFrame(final_rows)

# Sort for easier manual tagging (Group by Document, then Page, then Entity)
sort_cols = [c for c in ["Document_Name", "Page", "entity_name"] if c in output_df.columns]
if sort_cols:
    output_df = output_df.sort_values(by=sort_cols)

# --- 6. SAVE TO CSV ---

output_filename = "entities.csv"
output_filename_to_edit = "entities_to_edit.csv"

output_df.to_csv(output_filename, index=False)
output_df.to_csv(output_filename_to_edit, index=False)

print(f"--- Processing Complete ---")
print(f"Extracted {len(output_df)} entities.")
print(f"Saved to: {output_filename}")
print(output_df.head())

Using device: cpu
2026-01-14 20:55:39,604 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Preparing data for batch processing...
Total sentences to process: 1359
Starting batched prediction...


Batch inference: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 43/43 [01:43<00:00,  2.40s/it]

Extracting entities and merging metadata...
--- Processing Complete ---
Extracted 937 entities.
Saved to: entities2.csv
    Doc_ID Sentence_ID Country  Year                       Document_Name  \
750      1          10     CAN  2022  Canada's National Quantum Strategy   
917      1         102     CAN  2022  Canada's National Quantum Strategy   
852      1         111     CAN  2022  Canada's National Quantum Strategy   
316      1         118     CAN  2022  Canada's National Quantum Strategy   
468      1         118     CAN  2022  Canada's National Quantum Strategy   

                     entity_name ner_label qh_category qh_sub_category  \
750         Government of Canada       ORG                               
917  Standards Council of Canada       ORG                               
852               Bank of Canada       ORG                               
316            Quantum Computing       ORG                               
468  Innovative Solutions Canada       ORG           




In [10]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output

# --- 1. CONFIGURATION & DATA LOADING ---
filename = "2entities_to_edit.csv"

# --- TAXONOMY (General English) ---
TAXONOMY = {
    "Academia": {
        "Higher Education Institutions (HEIs)": {
            "desc": "Universities, Colleges, Schools (Teaching & Research).",
            "keywords": [
                "University", "College", "School", "Academy", "Faculty", "Department", 
                "Chair", "Campus", "Institute of Technology", "Polytechnic", 
                "Business School", "Medical School", "Law School"
            ]
        },
        "Public Research Orgs (PROs)": {
            "desc": "Research Institutes (Knowledge Output, no teaching).",
            "keywords": [
                "Institute", "Center", "Centre", "Laboratory", "Lab", "Observatory", 
                "National Lab", "Research Council", "Think Tank", "Agency (Research)"
            ]
        }
    },
    "Industry": {
        "Private Firms (Corporates)": {
            "desc": "Established Companies, SMEs, MNEs.",
            "keywords": [
                "Inc", "Corp", "Corporation", "Ltd", "LLC", "PLC", "Co", "Company", 
                "Group", "Holdings", "Manufacturer", "Supplier", "Vendor", 
                "Conglomerate", "Multinational", "Enterprise", "Firm"
            ]
        },
        "Start-ups": {
            "desc": "Young Growth Companies, Spin-offs.",
            "keywords": [
                "Start-up", "Startup", "Spin-off", "Spinoff", "Scale-up", "Unicorn", 
                "Venture", "NewCo", "DeepTech", "Founder", "Stealth Mode"
            ]
        },
        "Consulting": {
            "desc": "Services, Advisory, Legal, HR.",
            "keywords": [
                "Consulting", "Consultancy", "Advisors", "Partners", "Legal", "Law Firm", 
                "LLP", "Attorney", "IP Law", "Patent", "Audit", "Tax", "Recruitment", 
                "Headhunter", "Strategy", "Management", "Services"
            ]
        },
        "Venture Capital / Investors": {
            "desc": "Financial Actors, VCs, Business Angels.",
            "keywords": [
                "Capital", "Invest", "Investment", "Fund", "Venture", "VC", "Equity", 
                "Private Equity", "PE", "Angel", "Seed", "Asset Management", "Bank", 
                "Financial Group", "Holding", "Wealth Management"
            ]
        }
    },
    "Government": {
        "Policy Makers": {
            "desc": "Ministries, Councils, Parliaments (Regulation).",
            "keywords": [
                "Ministry", "Department", "Dept", "Council", "Government", "Federal", 
                "State", "Municipality", "City", "County", "District", "Parliament", 
                "Senate", "Commission", "Mayor", "Governor", "Regulator", "Authority", 
                "Administration", "Bureau"
            ]
        },
        "Funding Agencies": {
            "desc": "Funding Bodies, Project Management Agencies.",
            "keywords": [
                "Foundation", "Agency", "Grant", "Funding", "Fund", "Endowment", 
                "Trust", "Award", "Scholarship", "Fellowship", "Program", "Initiative"
            ]
        }
    },
    "Civil Society": {
        "Media": {
            "desc": "Press, News, Journals.",
            "keywords": [
                "News", "Journal", "Press", "Times", "Post", "Daily", "Review", 
                "Magazine", "Publisher", "Broadcaster", "TV", "Radio", "Podcast", 
                "Blog", "Media", "Outlet", "Chronicle", "Gazette"
            ]
        },
        "Cultural Institutions": {
            "desc": "Museums, Libraries, Galleries.",
            "keywords": [
                "Museum", "Library", "Gallery", "Theater", "Opera", "Orchestra", 
                "Archive", "Collection", "Exhibition", "Zoo", "Botanical Garden", 
                "Science Center", "Planetarium", "Hall"
            ]
        },
        "NGOs / NPOs": {
            "desc": "Non-Profit, Social Goals, Charities.",
            "keywords": [
                "Charity", "Non-Profit", "NPO", "NGO", "Organization", "Society", 
                "Club", "Union", "Alliance", "Federation", "Initiative", "Philanthropy", 
                "Foundation (Private)", "Mission", "Relief"
            ]
        },
        "Intermediaries": {
            "desc": "Clusters, Hubs, TTOs, Chambers.",
            "keywords": [
                "Cluster", "Network", "Hub", "Incubator", "Accelerator", "TTO", 
                "Technology Transfer", "Chamber of Commerce", "Trade Union", 
                "Association", "Consortium", "Standardization", "Body", "Council (Trade)"
            ]
        },
        "Citizens / Users": {
            "desc": "Citizens, Patients, User Groups.",
            "keywords": [
                "Community", "Group", "Public", "Citizen", "Patient", "User", 
                "Resident", "Population", "Crowd", "Forum", "Volunteer", "Advocacy"
            ]
        }
    }
}

try:
    df = pd.read_csv(filename).fillna("")
    required_cols = ["qh_category", "qh_sub_category", "qh_exact_category", "modified_entity_name", "finalized_entity_name"]
    for col in required_cols:
        if col not in df.columns:
            df[col] = ""
except FileNotFoundError:
    print(f"Error: '{filename}' not found.")
    df = pd.DataFrame()

# --- 2. WIDGET SETUP ---

if not df.empty:
    unfinished_indices = df[df['qh_category'] == ""].index.tolist()
    current_idx = unfinished_indices[0] if unfinished_indices else 0
    NEW_OPT = "+++ Create New +++"

    # Progress Bar
    total_items = len(df)
    progress = widgets.IntProgress(
        value=len(df) - len(unfinished_indices),
        min=0,
        max=total_items,
        description='Progress:',
        bar_style='success',
        layout=widgets.Layout(width='99%')
    )
    progress_label = widgets.Label(value=f"{progress.value} / {total_items} tagged")

    # --- INPUT WIDGETS ---

    # 0. Entity Name Editor
    w_name_edit = widgets.Text(
        description='<b>Edit Name:</b>',
        placeholder='Correct the entity name here...',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='600px')
    )

    # 1. Main Category
    w_helix = widgets.Dropdown(
        options=[''] + list(TAXONOMY.keys()) + [NEW_OPT],
        description='1. Helix:',
        layout=widgets.Layout(width='400px')
    )
    w_helix_new = widgets.Text(
        placeholder='Type new Helix category...',
        layout=widgets.Layout(width='400px', display='none') 
    )

    # 2. Sub Category
    w_sub = widgets.Dropdown(
        options=[],
        description='2. Type:',
        layout=widgets.Layout(width='400px'),
        disabled=True
    )
    w_sub_new = widgets.Text(
        placeholder='Type new Sub-Category...',
        layout=widgets.Layout(width='400px', display='none')
    )

    # 3. Exact Category (MULTI SELECT)
    w_exact_multi = widgets.SelectMultiple(
        options=[],
        description='3. Exact:',
        rows=8, # Height of the box
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px')
    )
    
    # Helper text for multi-select
    w_multi_help = widgets.HTML(
        value="<div style='font-size:0.8em; color:#666; margin-left:100px;'><i>Hold <b>Ctrl</b> (Win) or <b>Cmd</b> (Mac) to select multiple.</i></div>"
    )

    w_exact_new = widgets.Text(
        placeholder='Type NEW keyword here...',
        layout=widgets.Layout(width='400px')
    )
    # Button to add the typed keyword to the list immediately
    btn_add_exact = widgets.Button(
        description='Add to List',
        icon='plus',
        layout=widgets.Layout(width='100px')
    )

    # Info & Display
    w_info = widgets.HTML(value="<div style='color:#666; font-style:italic; margin-left:100px;'>Select a category...</div>")
    w_entity_display = widgets.HTML()
    output_log = widgets.Output()

    # Buttons
    btn_save = widgets.Button(description='Save & Next ‚û°Ô∏è', button_style='primary')
    btn_prev = widgets.Button(description='‚¨ÖÔ∏è Previous')

    # --- 3. LOGIC ---

    def get_split_history():
        """Reads all exact categories, splits by ';', and returns unique items for the dropdown."""
        all_vals = df['qh_exact_category'].dropna().unique()
        unique_items = set()
        for val in all_vals:
            if str(val) == "nan" or str(val).strip() == "": continue
            parts = [p.strip() for p in str(val).split(';')]
            for p in parts:
                if p: unique_items.add(p)
        return sorted(list(unique_items))

    def update_display():
        """Refreshes UI for current row."""
        if current_idx >= len(df):
            w_entity_display.value = "<div style='background:#d4edda; color:#155724; padding:15px;'><h3>üéâ All Done!</h3></div>"
            return

        row = df.loc[current_idx]
        
        # Display Context
        w_entity_display.value = f"""
        <div style="background-color: #f8f9fa; border-left: 5px solid #0d6efd; padding: 15px; margin-bottom: 10px;">
            <div style="margin-bottom: 5px; font-size: 0.9em; color: #495057;">
                <b>Original Entity:</b> <span style="font-family: monospace; font-size: 1.1em;">{row['entity_name']}</span>
            </div>
            <div style="font-size: 0.85em; color: #666;">
                Doc: {row.get('Document_Name', 'N/A')} | Year: {row.get('Year', 'N/A')}
            </div>
        </div>
        """
        
        # Name Edit
        existing_final = str(row['finalized_entity_name'])
        if existing_final and existing_final.strip() != "" and existing_final != "nan":
            w_name_edit.value = existing_final
        else:
            w_name_edit.value = str(row['entity_name'])

        # Load Categories
        current_cat = row['qh_category']
        current_sub = row['qh_sub_category']
        current_exact = str(row['qh_exact_category'])

        # 1. Set Helix
        if current_cat in w_helix.options:
            w_helix.value = current_cat
        elif current_cat:
            w_helix.options = list(w_helix.options)[:-1] + [current_cat, NEW_OPT]
            w_helix.value = current_cat
        else:
            w_helix.value = ''

        # 2. Set Sub (Trigger updates)
        update_sub_options(w_helix.value)
        if current_sub in w_sub.options:
            w_sub.value = current_sub
        elif current_sub:
            w_sub.options = list(w_sub.options)[:-1] + [current_sub, NEW_OPT]
            w_sub.value = current_sub
            
        # 3. Set Exact (Trigger updates based on Sub)
        update_exact_options(w_helix.value, w_sub.value)
        
        # Handle Pre-selection of Multiple Items
        if current_exact and current_exact != "nan" and current_exact.strip():
            # Split by semicolon
            selected_items = [x.strip() for x in current_exact.split(';')]
            # Ensure they exist in options
            current_options = list(w_exact_multi.options)
            for item in selected_items:
                if item not in current_options and item != "":
                    current_options.append(item)
            
            w_exact_multi.options = sorted(current_options)
            
            # Set Value (must be a tuple of matching strings)
            valid_selection = [x for x in selected_items if x in w_exact_multi.options]
            w_exact_multi.value = tuple(valid_selection)
        else:
            w_exact_multi.value = ()

        # Reset New Fields
        w_helix_new.layout.display = 'none'
        w_sub_new.layout.display = 'none'
        w_exact_new.value = '' # Clear new keyword box

        # Progress
        done_count = len(df[df['qh_category'] != ""])
        progress.value = done_count
        progress_label.value = f"{done_count} / {total_items} tagged"

    def update_sub_options(main_cat):
        if main_cat in TAXONOMY:
            opts = sorted(list(TAXONOMY[main_cat].keys()))
            w_sub.options = [''] + opts + [NEW_OPT]
            w_sub.disabled = False
        elif main_cat and main_cat != NEW_OPT:
            w_sub.options = [''] + [NEW_OPT]
            w_sub.disabled = False
        else:
            w_sub.options = []
            w_sub.disabled = True

    def update_exact_options(main, sub):
        """Populates SelectMultiple with Keywords + Single Item History"""
        options = []
        
        # 1. Add Taxonomy Keywords
        if main in TAXONOMY and sub in TAXONOMY[main]:
            keywords = sorted(TAXONOMY[main][sub].get('keywords', []))
            options += keywords
        
        # 2. Add Global History (Split individual items)
        history = get_split_history()
        # Merge and Unique
        combined = sorted(list(set(options + history)))
        
        w_exact_multi.options = combined

    # --- EVENT HANDLERS ---

    def on_helix_change(change):
        if change['type'] == 'change' and change['name'] == 'value':
            val = change['new']
            if val == NEW_OPT:
                w_helix_new.layout.display = 'block'
                w_sub.options = [''] + [NEW_OPT]
                w_sub.disabled = False
                w_info.value = ""
            else:
                w_helix_new.layout.display = 'none'
                update_sub_options(val)
                update_exact_options(val, '')
                w_info.value = ""

    def on_sub_change(change):
        if change['type'] == 'change' and change['name'] == 'value':
            val = change['new']
            main = w_helix.value
            
            if val == NEW_OPT:
                w_sub_new.layout.display = 'block'
                w_info.value = ""
            else:
                w_sub_new.layout.display = 'none'
                update_exact_options(main, val)
                
                if main in TAXONOMY and val in TAXONOMY[main]:
                    desc = TAXONOMY[main][val]['desc']
                    w_info.value = f"<div style='color:#0d6efd; margin-left:100px;'>‚ÑπÔ∏è {desc}</div>"

    def add_new_keyword(b):
        """Adds text from input box to the multiple selection list and selects it."""
        new_val = w_exact_new.value.strip()
        if new_val:
            # Add to options
            current_opts = list(w_exact_multi.options)
            if new_val not in current_opts:
                current_opts.append(new_val)
                w_exact_multi.options = sorted(current_opts)
            
            # Add to selection
            current_sel = list(w_exact_multi.value)
            if new_val not in current_sel:
                current_sel.append(new_val)
                w_exact_multi.value = tuple(current_sel)
            
            w_exact_new.value = "" # Clear input

    def save_and_next(b):
        global current_idx
        
        # 1. Name Logic
        original_name = str(df.at[current_idx, 'entity_name']).strip()
        new_name_input = str(w_name_edit.value).strip()
        
        if new_name_input != original_name:
            df.at[current_idx, 'modified_entity_name'] = True
            df.at[current_idx, 'finalized_entity_name'] = new_name_input
            name_log = f"Edited name"
        else:
            df.at[current_idx, 'modified_entity_name'] = False
            df.at[current_idx, 'finalized_entity_name'] = original_name
            name_log = "Name original"

        # 2. Category Logic
        val_cat = w_helix_new.value if w_helix.value == NEW_OPT else w_helix.value
        val_sub = w_sub_new.value if w_sub.value == NEW_OPT else w_sub.value
        
        # Process Exact: Join tuple values with semicolon
        selected_exacts = w_exact_multi.value
        val_exact = "; ".join(selected_exacts)
        
        # Learn Main
        if w_helix.value == NEW_OPT and val_cat:
            if val_cat not in TAXONOMY:
                TAXONOMY[val_cat] = {}
                opts = list(w_helix.options)
                opts.insert(-1, val_cat)
                w_helix.options = opts
        
        # Learn Sub
        if w_sub.value == NEW_OPT and val_sub:
            if val_cat not in TAXONOMY: TAXONOMY[val_cat] = {}
            if val_sub not in TAXONOMY[val_cat]:
                TAXONOMY[val_cat][val_sub] = {'desc': 'User Added', 'keywords': []}
            opts = list(w_sub.options)
            opts.insert(-1, val_sub)
            w_sub.options = opts

        # Exact History Learning is handled automatically next time get_split_history is called on the DF

        # Save to DF
        df.at[current_idx, 'qh_category'] = val_cat
        df.at[current_idx, 'qh_sub_category'] = val_sub
        df.at[current_idx, 'qh_exact_category'] = val_exact
        
        df.to_csv(filename, index=False)
        
        with output_log:
            clear_output(wait=True)
            print(f"‚úÖ Saved. ({name_log})")

        # Next
        if current_idx < len(df) - 1:
            current_idx += 1
            update_display()
        else:
            with output_log:
                print("End of list reached.")

    def go_prev(b):
        global current_idx
        if current_idx > 0:
            current_idx -= 1
            update_display()

    # --- 4. LAYOUT ---
    w_helix.observe(on_helix_change)
    w_sub.observe(on_sub_change)
    btn_add_exact.on_click(add_new_keyword)
    
    btn_save.on_click(save_and_next)
    btn_prev.on_click(go_prev)

    update_display()

    ui = widgets.VBox([
        widgets.HBox([progress, progress_label]),
        w_entity_display,
        
        w_name_edit,
        widgets.HTML("<div style='height:10px;'></div>"),

        w_helix, w_helix_new,
        w_sub, w_sub_new,
        w_info,
        
        widgets.HTML("<hr style='margin: 5px 0; border:0; border-top:1px solid #eee;'>"),
        
        # Exact Multi Select Section
        widgets.VBox([
            w_exact_multi,
            w_multi_help,
            widgets.HBox([w_exact_new, btn_add_exact])
        ]),
        
        widgets.HTML("<hr style='margin: 10px 0;'>"),
        widgets.HBox([btn_prev, btn_save]),
        output_log
    ])
    
    display(ui)

else:
    print("DataFrame is empty.")

VBox(children=(HBox(children=(IntProgress(value=648, bar_style='success', description='Progress:', layout=Layo‚Ä¶

In [None]:
# Run again deduplication and save
#TODO Run again deduplication and save: since modified_entities might have created duplicates
# Eigentlich ja noch Entities und deren Abk√ºrzungen zusammenfassen sonst gibt es ja bei Entities mit Abk√ºrzungen doppelte Eintr√§ge


In [None]:
# Cell 6: Deduplication and Final Data Prep
import pandas as pd

# 1. Load the manually edited file
df_edited = pd.read_csv("2entities_to_edit.csv", dtype=str).fillna("")

# 2. Filter for valid entities only
# We only want rows where a category has been assigned (Human verified)
df_final = df_edited[df_edited['qh_category'] != ""].copy()

# 3. Standardize Names
# Use 'finalized_entity_name' as the source of truth. 
# If it's empty (no edit), fall back to 'entity_name'.
df_final['Label'] = df_final['finalized_entity_name'].where(
    df_final['finalized_entity_name'] != "", 
    df_final['entity_name']
)

# 4. Drop Duplicates within the same Context (Block)
# If "Canada" appears twice in Block 1, we only need it once for the network.
df_unique_context = df_final.drop_duplicates(subset=['Doc_ID', 'Block_ID', 'Label'])

print(f"Unique Actor Occurrences: {len(df_unique_context)}")
print(df_unique_context[['Doc_ID', 'Label', 'qh_category']].head())

# Save this for the network step
df_unique_context.to_csv("entities_ready_for_network.csv", index=False)

Unique Actor Occurrences: 161
  Doc_ID                                Label    qh_category
0      0                             Acountry  not specified
1      0        The National Quantum Strategy   not an actor
2      0                                  NQS   not an actor
3      0                 Government of Canada     Government
4      0  National Research Council of Canada       Academia


In [2]:
import pandas as pd

# Load the data
df = pd.read_csv('2entities_to_edit.csv')

# 1. Explode the 'qh_exact_category' column 
# This splits strings like "Accelerator; Alliance" into separate rows so we can count them
df_exploded = df.copy()
df_exploded['qh_exact_category'] = df_exploded['qh_exact_category'].fillna('not specified').astype(str).str.split(';')
df_exploded = df_exploded.explode('qh_exact_category')
df_exploded['qh_exact_category'] = df_exploded['qh_exact_category'].str.strip()

# 2. Group the data for the hierarchy
# We use the original df for top-level counts and exploded df for exact category counts
final_counts = df_exploded.groupby(['qh_category', 'qh_sub_category', 'qh_exact_category']).size().reset_index(name='exact_count')

# 3. Print the Hierarchical Structure
unique_categories = df['qh_category'].dropna().unique()

for cat in unique_categories:
    # Top Level: Category
    cat_total = df[df['qh_category'] == cat].shape[0]
    print(f"\n{cat}: absolute number of mentions: {cat_total}")
    
    # Second Level: Sub-category
    sub_df = final_counts[final_counts['qh_category'] == cat]
    for sub in sub_df['qh_sub_category'].unique():
        sub_total = df[(df['qh_category'] == cat) & (df['qh_sub_category'] == sub)].shape[0]
        print(f"  -> {sub} ({sub_total} mentions)")
        
        # Third Level: Exact categories
        exact_df = sub_df[sub_df['qh_sub_category'] == sub]
        # Format the exact categories into a readable string
        exact_list = [f"{row['qh_exact_category']} ({row['exact_count']})" for _, row in exact_df.iterrows()]
        print(f"    -> Exact categories: {', '.join(exact_list)}")


not an actor: absolute number of mentions: 66
  -> not an actor (66 mentions)
    -> Exact categories: EU (2), Financial Plan (1), Financial Vehicle (2), Fund (3), Funding (3), Grant Program (1), Infrastructure Initiative (1), Mission (1), Policy Framework (2), Program (23), Strategic Document (9), country (1), not an actor (58), not specified (1)

Government: absolute number of mentions: 82
  -> Bank (3 mentions)
    -> Exact categories: Corporate VC (3), Funding (3)
  -> Federal Agency (3 mentions)
    -> Exact categories: Europe (1), Federal Agency (3), International Standard Body (1), Regulatory Authority (2), Space Agency (1), US (2)
  -> Federal Research Organization (2 mentions)
    -> Exact categories: Federal Research Agency (2)
  -> Funding Agencies (28 mentions)
    -> Exact categories: Advisory Service (1), Federal Agency (11), Funding (12), Grant (2), International Partnership (6), Research Council (4), SME Support (1), US (1)
  -> Governent Support Service (1 mentions)
 