In [1]:
import csv
import nltk
import pandas as pd
import flair
import torch
from pathlib import Path
from flair.data import Sentence
from flair.models import SequenceTagger

# --- 1. Setup Environment ---

# Make sure the NLTK 'punkt_tab' tokenizer data is available locally;
# nltk.data.find raises LookupError when the resource is absent.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

# Point Flair at the GPU when torch can see one, otherwise at the CPU
if torch.cuda.is_available():
    flair.device = torch.device('cuda')
else:
    flair.device = torch.device('cpu')
print(f"Using Flair device: {flair.device}")

# The tagger is loaded a single time here and reused by the later cells.
# 'ner-fast' trades a little accuracy for speed; 'ner' is the slower variant.
print("Loading NER model...")
tagger = SequenceTagger.load('ner-fast')
print("Model loaded successfully.")

Using Flair device: cpu
Loading NER model...
2026-01-19 15:25:53,716 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Model loaded successfully.


In [6]:
# --- 2. Process Files & Split by PARAGRAPH ---

def _split_paragraphs(raw_content):
    """Yield the paragraphs of ``raw_content`` as single-line strings.

    A paragraph is a run of consecutive non-blank lines. A line counts as
    blank when it is empty or holds only whitespace, so a separator line
    containing stray spaces/tabs still ends the paragraph. Lines inside a
    paragraph are joined with one space and the result is stripped.
    """
    current = []
    for line in raw_content.split('\n'):
        if line.strip():
            current.append(line)
        elif current:
            yield ' '.join(current).strip()
            current = []
    if current:
        yield ' '.join(current).strip()


def process_folder_to_paragraphs(input_folder_path, output_csv_path):
    """Split every ``*.txt`` file in a folder into paragraphs and write them to a CSV.

    File names are expected to look like ``<Country>_<Year>_<Document name>.txt``.
    When the second underscore-separated part is not all digits the year is
    recorded as "Unknown" and the document name starts at that part instead.

    Args:
        input_folder_path: Folder containing the UTF-8 ``.txt`` documents.
        output_csv_path: Destination CSV. It is created/overwritten with the
            columns Paragraph_ID, Doc_ID, Country, Year, Document_Name, text.

    Returns:
        None. If the input folder does not exist (or is not a directory) an
        error is printed and no file is written. A failure while handling one
        document is printed and the remaining documents are still processed.
    """
    input_dir = Path(input_folder_path)

    # is_dir() also rejects a path that exists but is a regular file
    if not input_dir.is_dir():
        print(f"Error: Directory '{input_folder_path}' not found.")
        return

    with open(output_csv_path, 'w', newline='', encoding='utf-8') as out_f:
        writer = csv.writer(out_f)

        # Header (Changed 'Sentence_ID' to 'Paragraph_ID')
        header = ["Paragraph_ID", "Doc_ID", "Country", "Year", "Document_Name", "text"]
        writer.writerow(header)

        # glob() yields files in filesystem order, which differs between
        # machines/runs; sorting keeps Doc_ID stable for the same folder.
        text_files = sorted(input_dir.glob("*.txt"))

        print(f"Found {len(text_files)} files. Processing into paragraphs...")

        for doc_id, file_path in enumerate(text_files, start=1):
            try:
                # Metadata Extraction
                parts = file_path.stem.split("_")
                country = parts[0]
                has_year = len(parts) > 1 and parts[1].isdigit()
                year = parts[1] if has_year else "Unknown"
                start_index = 2 if has_year else 1
                doc_name = "_".join(parts[start_index:]) if len(parts) > start_index else "Unknown"

                # --- READ & SPLIT BY PARAGRAPH ---
                with open(file_path, 'r', encoding='utf-8') as f:
                    raw_content = f.read()

                para_id = 1
                for clean_para in _split_paragraphs(raw_content):
                    # Skip empty / near-empty paragraphs (fewer than 5 characters)
                    if len(clean_para) < 5:
                        continue

                    writer.writerow([
                        para_id,        # Paragraph ID (restarts at 1 for each document)
                        doc_id,
                        country,
                        year,
                        doc_name,
                        clean_para      # The full paragraph text
                    ])
                    para_id += 1

            except Exception as e:
                # Best effort: report the failing document and carry on with the rest
                print(f"Error processing {file_path.name}: {e}")

    print(f"Success! Paragraph data written to: {output_csv_path}")

# Paths for this stage: the folder of source .txt documents and the
# intermediate paragraph CSV that the NER cell reads back in.
input_folder = "countries_edited"
intermediate_file = "output.csv"

process_folder_to_paragraphs(
    input_folder_path=input_folder,
    output_csv_path=intermediate_file,
)

Found 4 files. Processing into paragraphs...
Success! Paragraph data written to: output.csv


In [7]:
# --- 3. Run NER on Paragraphs & Deduplicate ---
#
# Reads the paragraph CSV, tags every paragraph with the Flair model loaded in
# the setup cell, keeps ORG/PER entities (one row per distinct entity text and
# label within a document) and writes two CSVs:
#   * entities.csv          - always rewritten: the raw NER result
#   * entities_to_edit.csv  - working copy for manual labelling; only created
#                             when it does not exist yet (see below)

print("Reading paragraph CSV...")
df = pd.read_csv(intermediate_file)

# Ensure text column is string
df["text"] = df["text"].fillna("").astype(str)

# Prepare Flair Sentences (each "Sentence" object holds a full paragraph)
flair_paragraphs = []
valid_indices = []  # index labels of the df rows, parallel to flair_paragraphs

for idx, text in df["text"].items():
    # Skip short/garbage text
    if len(text) < 5:
        continue
    flair_paragraphs.append(Sentence(text))
    valid_indices.append(idx)

print(f"Running NER on {len(flair_paragraphs)} paragraphs...")

# Batch Prediction (larger batches only when a GPU is available)
BATCH_SIZE = 256 if torch.cuda.is_available() else 32
if flair_paragraphs:  # nothing to tag when the CSV held no usable paragraphs
    tagger.predict(flair_paragraphs, mini_batch_size=BATCH_SIZE, verbose=True)

# --- Deduplication (Per Document) ---
print("Extracting entities...")

# Fixed column order; also guarantees a proper header row when no entity is found
ENTITY_COLUMNS = [
    "Paragraph_ID", "Doc_ID", "Country", "Year", "Document_Name",
    "entity_name", "ner_label",
    "qh_category", "qh_sub_category", "qh_exact_category",
]

final_rows = []
seen_entities_per_doc = set()  # (Doc_ID, entity text, tag) triples already emitted

for original_idx, sent_obj in zip(valid_indices, flair_paragraphs):
    # valid_indices holds index *labels*, so look the row up with .loc
    original_row = df.loc[original_idx]
    current_doc_id = original_row["Doc_ID"]

    for entity in sent_obj.get_spans('ner'):
        if entity.tag in ['ORG', 'PER']:

            dedup_key = (current_doc_id, entity.text, entity.tag)

            if dedup_key not in seen_entities_per_doc:
                seen_entities_per_doc.add(dedup_key)

                # The first paragraph in which the entity appears is recorded
                final_rows.append({
                    "Paragraph_ID": original_row["Paragraph_ID"],
                    "Doc_ID": current_doc_id,
                    "Country": original_row["Country"],
                    "Year": original_row["Year"],
                    "Document_Name": original_row["Document_Name"],
                    "entity_name": entity.text,
                    "ner_label": entity.tag,
                    "qh_category": "",
                    "qh_sub_category": "",
                    "qh_exact_category": ""
                })

output_df = pd.DataFrame(final_rows, columns=ENTITY_COLUMNS)

output_filename = "entities.csv"
output_df.to_csv(output_filename, index=False)
print("--- Processing Complete ---")
print(f"Saved to: {output_filename}")

# The annotation cell saves hand-entered labels back into this file, so
# overwriting it on a re-run would silently discard that work.
output_filename = "entities_to_edit.csv"
if Path(output_filename).exists():
    print(f"Kept existing '{output_filename}' (it may contain manual labels); delete it to regenerate.")
else:
    output_df.to_csv(output_filename, index=False)
    print(f"Saved to: {output_filename}")

output_df.head()

Reading paragraph CSV...
Running NER on 528 paragraphs...


Batch inference: 100%|██████████| 17/17 [03:03<00:00, 10.79s/it]


Extracting entities...
--- Processing Complete ---
Saved to: entities_to_edit.csv


Unnamed: 0,Paragraph_ID,Doc_ID,Country,Year,Document_Name,entity_name,ner_label,qh_category,qh_sub_category,qh_exact_category
0,2,1,CAN,2022,Canada's National Quantum Strategy,The National Quantum Strategy,ORG,,,
1,2,1,CAN,2022,Canada's National Quantum Strategy,NQS,ORG,,,
2,4,1,CAN,2022,Canada's National Quantum Strategy,Government of Canada,ORG,,,
3,10,1,CAN,2022,Canada's National Quantum Strategy,National Research Council of Canada,ORG,,,
4,10,1,CAN,2022,Canada's National Quantum Strategy,NRC,ORG,,,


In [1]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output

# --- 1. CONFIGURATION & DATA LOADING ---
# CSV that is loaded below and rewritten by save_and_next after every save.
filename = "entities_to_edit.csv"

# --- TAXONOMY (General English) ---
# Three-level structure used to populate the selection widgets:
#   top-level key  -> option in the "1. Helix" dropdown
#   second-level   -> option in the "2. Type" dropdown
#   "desc"         -> hint shown under the dropdowns for the chosen type
#   "keywords"     -> suggestions offered in the "3. Exact" multi-select
# save_and_next adds user-created categories to this dict at runtime; those
# additions live only in memory for the current session.
TAXONOMY = {
    "Academia": {
        "Higher Education Institutions (HEIs)": {
            "desc": "Universities, Colleges, Schools (Teaching & Research).",
            "keywords": [
                "University", "College", "School", "Academy", "Faculty", "Department", 
                "Chair", "Campus", "Institute of Technology", "Polytechnic", 
                "Business School", "Medical School", "Law School"
            ]
        },
        "Public Research Orgs (PROs)": {
            "desc": "Research Institutes (Knowledge Output, no teaching).",
            "keywords": [
                "Institute", "Center", "Centre", "Laboratory", "Lab", "Observatory", 
                "National Lab", "Research Council", "Think Tank", "Agency (Research)"
            ]
        }
    },
    "Industry": {
        "Private Firms (Corporates)": {
            "desc": "Established Companies, SMEs, MNEs.",
            "keywords": [
                "Inc", "Corp", "Corporation", "Ltd", "LLC", "PLC", "Co", "Company", 
                "Group", "Holdings", "Manufacturer", "Supplier", "Vendor", 
                "Conglomerate", "Multinational", "Enterprise", "Firm"
            ]
        },
        "Start-ups": {
            "desc": "Young Growth Companies, Spin-offs.",
            "keywords": [
                "Start-up", "Startup", "Spin-off", "Spinoff", "Scale-up", "Unicorn", 
                "Venture", "NewCo", "DeepTech", "Founder", "Stealth Mode"
            ]
        },
        "Consulting": {
            "desc": "Services, Advisory, Legal, HR.",
            "keywords": [
                "Consulting", "Consultancy", "Advisors", "Partners", "Legal", "Law Firm", 
                "LLP", "Attorney", "IP Law", "Patent", "Audit", "Tax", "Recruitment", 
                "Headhunter", "Strategy", "Management", "Services"
            ]
        },
        "Venture Capital / Investors": {
            "desc": "Financial Actors, VCs, Business Angels.",
            "keywords": [
                "Capital", "Invest", "Investment", "Fund", "Venture", "VC", "Equity", 
                "Private Equity", "PE", "Angel", "Seed", "Asset Management", "Bank", 
                "Financial Group", "Holding", "Wealth Management"
            ]
        }
    },
    "Government": {
        "Policy Makers": {
            "desc": "Ministries, Councils, Parliaments (Regulation).",
            "keywords": [
                "Ministry", "Department", "Dept", "Council", "Government", "Federal", 
                "State", "Municipality", "City", "County", "District", "Parliament", 
                "Senate", "Commission", "Mayor", "Governor", "Regulator", "Authority", 
                "Administration", "Bureau"
            ]
        },
        "Funding Agencies": {
            "desc": "Funding Bodies, Project Management Agencies.",
            "keywords": [
                "Foundation", "Agency", "Grant", "Funding", "Fund", "Endowment", 
                "Trust", "Award", "Scholarship", "Fellowship", "Program", "Initiative"
            ]
        }
    },
    "Civil Society": {
        "Media": {
            "desc": "Press, News, Journals.",
            "keywords": [
                "News", "Journal", "Press", "Times", "Post", "Daily", "Review", 
                "Magazine", "Publisher", "Broadcaster", "TV", "Radio", "Podcast", 
                "Blog", "Media", "Outlet", "Chronicle", "Gazette"
            ]
        },
        "Cultural Institutions": {
            "desc": "Museums, Libraries, Galleries.",
            "keywords": [
                "Museum", "Library", "Gallery", "Theater", "Opera", "Orchestra", 
                "Archive", "Collection", "Exhibition", "Zoo", "Botanical Garden", 
                "Science Center", "Planetarium", "Hall"
            ]
        },
        "NGOs / NPOs": {
            "desc": "Non-Profit, Social Goals, Charities.",
            "keywords": [
                "Charity", "Non-Profit", "NPO", "NGO", "Organization", "Society", 
                "Club", "Union", "Alliance", "Federation", "Initiative", "Philanthropy", 
                "Foundation (Private)", "Mission", "Relief"
            ]
        },
        "Intermediaries": {
            "desc": "Clusters, Hubs, TTOs, Chambers.",
            "keywords": [
                "Cluster", "Network", "Hub", "Incubator", "Accelerator", "TTO", 
                "Technology Transfer", "Chamber of Commerce", "Trade Union", 
                "Association", "Consortium", "Standardization", "Body", "Council (Trade)"
            ]
        },
        "Citizens / Users": {
            "desc": "Citizens, Patients, User Groups.",
            "keywords": [
                "Community", "Group", "Public", "Citizen", "Patient", "User", 
                "Resident", "Population", "Crowd", "Forum", "Volunteer", "Advocacy"
            ]
        }
    }
}

# Load the working CSV. Missing values become "" so that an empty
# 'qh_category' can be used as the "not tagged yet" marker, and any
# annotation column the file does not have yet is created empty.
# On failure df is left empty, which makes the widget section below print
# "DataFrame is empty." instead of building the UI.
try:
    df = pd.read_csv(filename).fillna("")
    required_cols = ["qh_category", "qh_sub_category", "qh_exact_category", "modified_entity_name", "finalized_entity_name"]
    for col in required_cols:
        if col not in df.columns:
            df[col] = ""
except FileNotFoundError:
    print(f"Error: '{filename}' not found.")
    df = pd.DataFrame()
except pd.errors.EmptyDataError:
    # Raised by read_csv when the file exists but has no header/columns
    print(f"Error: '{filename}' is empty (no columns to parse).")
    df = pd.DataFrame()

# --- 2. WIDGET SETUP ---

if not df.empty:
    # Rows whose 'qh_category' is still "" count as untagged; the session
    # starts on the first such row, or on row 0 when every row is tagged.
    unfinished_indices = df[df['qh_category'] == ""].index.tolist()
    current_idx = unfinished_indices[0] if unfinished_indices else 0
    # Sentinel dropdown entry that reveals the matching free-text box
    NEW_OPT = "+++ Create New +++"

    # Progress Bar (value = number of rows that already have a category)
    total_items = len(df)
    progress = widgets.IntProgress(
        value=len(df) - len(unfinished_indices),
        min=0,
        max=total_items,
        description='Progress:',
        bar_style='success',
        layout=widgets.Layout(width='99%')
    )
    progress_label = widgets.Label(value=f"{progress.value} / {total_items} tagged")

    # --- INPUT WIDGETS ---

    # 0. Entity Name Editor
    w_name_edit = widgets.Text(
        description='<b>Edit Name:</b>',
        placeholder='Correct the entity name here...',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='600px')
    )

    # 1. Main Category
    w_helix = widgets.Dropdown(
        options=[''] + list(TAXONOMY.keys()) + [NEW_OPT],
        description='1. Helix:',
        layout=widgets.Layout(width='400px')
    )
    # Hidden until NEW_OPT is picked in w_helix (display toggled by on_helix_change)
    w_helix_new = widgets.Text(
        placeholder='Type new Helix category...',
        layout=widgets.Layout(width='400px', display='none') 
    )

    # 2. Sub Category (starts disabled with no options; filled by update_sub_options)
    w_sub = widgets.Dropdown(
        options=[],
        description='2. Type:',
        layout=widgets.Layout(width='400px'),
        disabled=True
    )
    # Hidden until NEW_OPT is picked in w_sub (display toggled by on_sub_change)
    w_sub_new = widgets.Text(
        placeholder='Type new Sub-Category...',
        layout=widgets.Layout(width='400px', display='none')
    )

    # 3. Exact Category (MULTI SELECT); options are filled by update_exact_options
    w_exact_multi = widgets.SelectMultiple(
        options=[],
        description='3. Exact:',
        rows=8, # Height of the box
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px')
    )
    
    # Helper text for multi-select
    w_multi_help = widgets.HTML(
        value="<div style='font-size:0.8em; color:#666; margin-left:100px;'><i>Hold <b>Ctrl</b> (Win) or <b>Cmd</b> (Mac) to select multiple.</i></div>"
    )

    # Free-text box for a keyword that is not in the list yet
    w_exact_new = widgets.Text(
        placeholder='Type NEW keyword here...',
        layout=widgets.Layout(width='400px')
    )
    # Button to add the typed keyword to the list immediately
    btn_add_exact = widgets.Button(
        description='Add to List',
        icon='plus',
        layout=widgets.Layout(width='100px')
    )

    # Info & Display
    # w_info: hint line under the dropdowns; w_entity_display: card for the
    # current row; output_log: area where save_and_next prints its status.
    w_info = widgets.HTML(value="<div style='color:#666; font-style:italic; margin-left:100px;'>Select a category...</div>")
    w_entity_display = widgets.HTML()
    output_log = widgets.Output()

    # Buttons (labels carry arrow emoji: U+27A1 right arrow, U+2B05 left arrow)
    btn_save = widgets.Button(description='Save & Next ➡️', button_style='primary')
    btn_prev = widgets.Button(description='⬅️ Previous')

    # --- 3. LOGIC ---

    def get_split_history():
        """Return the sorted, de-duplicated keywords stored in 'qh_exact_category'.

        A stored cell may hold several keywords separated by ';'. Each cell is
        split on ';', every piece is stripped, and empty pieces are dropped.
        Cells that are blank or read as the literal text "nan" are ignored.
        """
        collected = set()
        for raw in df['qh_exact_category'].dropna().unique():
            text = str(raw)
            if text == "nan" or not text.strip():
                continue
            collected.update(piece.strip() for piece in text.split(';') if piece.strip())
        return sorted(collected)

    def update_display():
        """Refresh every widget so that it reflects the row at ``current_idx``.

        Shows the entity card, pre-fills the name editor (finalized name when
        one was saved, otherwise the original entity name), restores the
        stored helix / type / exact selections, resets the free-text "new"
        inputs and recomputes the progress bar. Stored categories that are
        not among the current dropdown options are appended to the options
        so they can be selected.
        """
        # Local import keeps this function self-contained (``html`` is stdlib)
        from html import escape

        if current_idx >= len(df):
            w_entity_display.value = "<div style='background:#d4edda; color:#155724; padding:15px;'><h3>🎉 All Done!</h3></div>"
            return

        row = df.loc[current_idx]

        # Display Context
        # Values come from document text / file names, so escape them before
        # placing them inside HTML ('<' or '&' would otherwise break the card).
        entity_html = escape(str(row['entity_name']))
        doc_html = escape(str(row.get('Document_Name', 'N/A')))
        year_html = escape(str(row.get('Year', 'N/A')))
        w_entity_display.value = f"""
        <div style="background-color: #f8f9fa; border-left: 5px solid #0d6efd; padding: 15px; margin-bottom: 10px;">
            <div style="margin-bottom: 5px; font-size: 0.9em; color: #495057;">
                <b>Original Entity:</b> <span style="font-family: monospace; font-size: 1.1em;">{entity_html}</span>
            </div>
            <div style="font-size: 0.85em; color: #666;">
                Doc: {doc_html} | Year: {year_html}
            </div>
        </div>
        """

        # Name Edit
        existing_final = str(row['finalized_entity_name'])
        if existing_final and existing_final.strip() != "" and existing_final != "nan":
            w_name_edit.value = existing_final
        else:
            w_name_edit.value = str(row['entity_name'])

        # Load Categories
        current_cat = row['qh_category']
        current_sub = row['qh_sub_category']
        current_exact = str(row['qh_exact_category'])

        # 1. Set Helix
        if current_cat in w_helix.options:
            w_helix.value = current_cat
        elif current_cat:
            # Stored category unknown to the dropdown: insert it before NEW_OPT
            w_helix.options = list(w_helix.options)[:-1] + [current_cat, NEW_OPT]
            w_helix.value = current_cat
        else:
            w_helix.value = ''

        # 2. Set Sub (Trigger updates)
        update_sub_options(w_helix.value)
        if current_sub in w_sub.options:
            w_sub.value = current_sub
        elif current_sub:
            w_sub.options = list(w_sub.options)[:-1] + [current_sub, NEW_OPT]
            w_sub.value = current_sub

        # 3. Set Exact (Trigger updates based on Sub)
        update_exact_options(w_helix.value, w_sub.value)

        # Handle Pre-selection of Multiple Items
        if current_exact and current_exact != "nan" and current_exact.strip():
            # Split by semicolon
            selected_items = [x.strip() for x in current_exact.split(';')]
            # Ensure they exist in options
            current_options = list(w_exact_multi.options)
            for item in selected_items:
                if item not in current_options and item != "":
                    current_options.append(item)

            # Options must be assigned before the value: the value may only
            # contain entries that are present in the options.
            w_exact_multi.options = sorted(current_options)

            # Set Value (must be a tuple of matching strings)
            valid_selection = [x for x in selected_items if x in w_exact_multi.options]
            w_exact_multi.value = tuple(valid_selection)
        else:
            w_exact_multi.value = ()

        # Reset New Fields: hide the boxes and drop any text left from a
        # previous row so it cannot be saved onto this one by accident.
        w_helix_new.layout.display = 'none'
        w_sub_new.layout.display = 'none'
        w_helix_new.value = ''
        w_sub_new.value = ''
        w_exact_new.value = '' # Clear new keyword box

        # Progress
        done_count = len(df[df['qh_category'] != ""])
        progress.value = done_count
        progress_label.value = f"{done_count} / {total_items} tagged"

    def update_sub_options(main_cat):
        """Rebuild the "2. Type" dropdown for the given helix category.

        A category present in TAXONOMY offers its sub-categories (sorted)
        between a blank entry and NEW_OPT. Any other non-empty category,
        except the NEW_OPT sentinel itself, offers only the blank entry and
        NEW_OPT. Otherwise the dropdown is emptied and disabled.
        """
        in_taxonomy = main_cat in TAXONOMY
        if not in_taxonomy and (not main_cat or main_cat == NEW_OPT):
            w_sub.options = []
            w_sub.disabled = True
            return

        known_subs = sorted(TAXONOMY[main_cat]) if in_taxonomy else []
        w_sub.options = [''] + known_subs + [NEW_OPT]
        w_sub.disabled = False

    def update_exact_options(main, sub):
        """Fill the "3. Exact" multi-select for a helix / sub-category pair.

        The options are the union of every keyword already stored in the
        frame (see get_split_history) and, when the pair exists in TAXONOMY,
        that sub-category's 'keywords', sorted with duplicates removed.
        """
        pool = set(get_split_history())
        if main in TAXONOMY and sub in TAXONOMY[main]:
            pool.update(TAXONOMY[main][sub].get('keywords', []))
        w_exact_multi.options = sorted(pool)

    # --- EVENT HANDLERS ---

    def on_helix_change(change):
        """Observer for w_helix: reacts only to changes of its 'value' trait.

        Choosing NEW_OPT reveals the free-text helix box and reduces the type
        dropdown to blank + NEW_OPT. Any other choice hides the box and
        rebuilds the type and exact options. The info line is cleared either way.
        """
        if change['type'] != 'change' or change['name'] != 'value':
            return

        chosen = change['new']
        creating_new = chosen == NEW_OPT
        w_helix_new.layout.display = 'block' if creating_new else 'none'

        if creating_new:
            w_sub.options = [''] + [NEW_OPT]
            w_sub.disabled = False
        else:
            update_sub_options(chosen)
            update_exact_options(chosen, '')

        w_info.value = ""

    def on_sub_change(change):
        """Observer for w_sub: reacts only to changes of its 'value' trait.

        Choosing NEW_OPT reveals the free-text sub-category box. Any other
        choice hides it, rebuilds the exact-keyword options and shows the
        TAXONOMY description of the chosen sub-category. When the choice has
        no description (blank or user-defined entry) the info line is
        cleared, so the hint of a previously selected type does not linger.
        """
        if change['type'] == 'change' and change['name'] == 'value':
            val = change['new']
            main = w_helix.value

            if val == NEW_OPT:
                w_sub_new.layout.display = 'block'
                w_info.value = ""
            else:
                w_sub_new.layout.display = 'none'
                update_exact_options(main, val)

                if main in TAXONOMY and val in TAXONOMY[main]:
                    desc = TAXONOMY[main][val]['desc']
                    w_info.value = f"<div style='color:#0d6efd; margin-left:100px;'>ℹ️ {desc}</div>"
                else:
                    w_info.value = ""

    def add_new_keyword(b):
        """Add the typed keyword to the multi-select and select it.

        Click handler for btn_add_exact (``b`` is the button, unused).
        Keywords that were already selected stay selected. Blank input is
        ignored; the input box is cleared after a keyword has been added.
        """
        new_val = w_exact_new.value.strip()
        if not new_val:
            return

        # Read the selection BEFORE touching the options: assigning
        # SelectMultiple.options clears the widget's current selection.
        current_sel = list(w_exact_multi.value)

        # Add to options
        current_opts = list(w_exact_multi.options)
        if new_val not in current_opts:
            current_opts.append(new_val)
            w_exact_multi.options = sorted(current_opts)

        # Restore the previous selection plus the new keyword
        if new_val not in current_sel:
            current_sel.append(new_val)
        w_exact_multi.value = tuple(current_sel)

        w_exact_new.value = "" # Clear input

    def save_and_next(b):
        """Store the form values on the current row, write the CSV, move on.

        Click handler for btn_save (``b`` is the button, unused). Steps:
          1. Record the finalized entity name and whether it differs from
             the original one.
          2. Resolve helix / type (free-text box when NEW_OPT is chosen,
             whitespace trimmed) and join the selected exact keywords
             with "; ".
          3. Register newly created categories in TAXONOMY and in the
             dropdowns for the rest of the session.
          4. Write the whole frame back to ``filename`` and refresh progress.
          5. Advance to the next row, or report the end of the list.
        """
        global current_idx

        # Snapshot the "create new" state up front: assigning new options to
        # w_helix below resets both dropdowns, after which w_sub.value no
        # longer tells us that a new sub-category was being created.
        helix_is_new = w_helix.value == NEW_OPT
        sub_is_new = w_sub.value == NEW_OPT

        # 1. Name Logic
        original_name = str(df.at[current_idx, 'entity_name']).strip()
        new_name_input = str(w_name_edit.value).strip()

        if new_name_input != original_name:
            df.at[current_idx, 'modified_entity_name'] = True
            df.at[current_idx, 'finalized_entity_name'] = new_name_input
            name_log = "Edited name"
        else:
            df.at[current_idx, 'modified_entity_name'] = False
            df.at[current_idx, 'finalized_entity_name'] = original_name
            name_log = "Name original"

        # 2. Category Logic
        # `or ""`: w_sub.value is None while the type dropdown has no options
        val_cat = (w_helix_new.value.strip() if helix_is_new else w_helix.value) or ""
        val_sub = (w_sub_new.value.strip() if sub_is_new else w_sub.value) or ""

        # Process Exact: Join tuple values with semicolon
        selected_exacts = w_exact_multi.value
        val_exact = "; ".join(selected_exacts)

        # Learn Main
        if helix_is_new and val_cat and val_cat not in TAXONOMY:
            TAXONOMY[val_cat] = {}
            opts = list(w_helix.options)
            if val_cat not in opts:
                opts.insert(-1, val_cat)  # keep NEW_OPT as the last entry
                w_helix.options = opts

        # Learn Sub (needs a helix to attach to; never create TAXONOMY[''])
        if sub_is_new and val_sub and val_cat:
            sub_map = TAXONOMY.setdefault(val_cat, {})
            sub_map.setdefault(val_sub, {'desc': 'User Added', 'keywords': []})
            opts = list(w_sub.options)
            # Only extend a populated dropdown, and never add a duplicate
            if NEW_OPT in opts and val_sub not in opts:
                opts.insert(-1, val_sub)
                w_sub.options = opts

        # Exact History Learning is handled automatically next time get_split_history is called on the DF

        # Save to DF
        df.at[current_idx, 'qh_category'] = val_cat
        df.at[current_idx, 'qh_sub_category'] = val_sub
        df.at[current_idx, 'qh_exact_category'] = val_exact

        df.to_csv(filename, index=False)

        # Refresh progress here as well, so saving the last row is counted
        done_count = int((df['qh_category'] != "").sum())
        progress.value = done_count
        progress_label.value = f"{done_count} / {total_items} tagged"

        with output_log:
            clear_output(wait=True)
            print(f"✅ Saved. ({name_log})")

        # Next
        if current_idx < len(df) - 1:
            current_idx += 1
            update_display()
        else:
            with output_log:
                print("End of list reached.")

    def go_prev(b):
        """Click handler for btn_prev: step back one row unless already on row 0."""
        global current_idx
        if current_idx <= 0:
            return
        current_idx -= 1
        update_display()

    # --- 4. LAYOUT ---
    # Wire the callbacks. observe() is registered without a trait filter, so
    # the two handlers themselves ignore everything except 'value' changes.
    w_helix.observe(on_helix_change)
    w_sub.observe(on_sub_change)
    btn_add_exact.on_click(add_new_keyword)
    
    btn_save.on_click(save_and_next)
    btn_prev.on_click(go_prev)

    # Populate the widgets for the starting row before anything is shown
    update_display()

    # Vertical stack, top to bottom: progress, entity card, name editor,
    # helix / type selectors with hint, exact-keyword section, navigation, log.
    ui = widgets.VBox([
        widgets.HBox([progress, progress_label]),
        w_entity_display,
        
        w_name_edit,
        widgets.HTML("<div style='height:10px;'></div>"),

        w_helix, w_helix_new,
        w_sub, w_sub_new,
        w_info,
        
        widgets.HTML("<hr style='margin: 5px 0; border:0; border-top:1px solid #eee;'>"),
        
        # Exact Multi Select Section
        widgets.VBox([
            w_exact_multi,
            w_multi_help,
            widgets.HBox([w_exact_new, btn_add_exact])
        ]),
        
        widgets.HTML("<hr style='margin: 10px 0;'>"),
        widgets.HBox([btn_prev, btn_save]),
        output_log
    ])
    
    display(ui)

else:
    # Reached when the CSV could not be loaded or contains no rows
    print("DataFrame is empty.")

VBox(children=(HBox(children=(IntProgress(value=648, bar_style='success', description='Progress:', layout=Layo…