In [4]:
#TASK: Parse PDFs in a directory, extract text and metadata, and save to CSV
import pdfplumber
import pandas as pd
from pathlib import Path

#Configuration: input directory, output file and the column layout of the CSV
PDF_DIR = Path("countries_edited")
OUTPUT_CSV = "document.csv"
CSV_COLUMNS = ["doc_id", "country", "year", "strategy_name", "file_name", "text"]


def parse_pdf_stem(stem):
    """Derive document metadata from a file stem split on underscores.

    The first part is the country. The second part is taken as the year only
    when it consists of digits. Everything from the third part onwards is
    re-joined with underscores as the strategy name.

    Args:
        stem: File name without its extension.

    Returns:
        dict with keys ``doc_id``, ``country``, ``year`` and ``strategy_name``.
        ``year`` and ``strategy_name`` are None when they cannot be derived;
        ``doc_id`` is ``<country>_<year>``, or just the country without a year.
    """
    parts = stem.split("_")
    # str.split always returns at least one element, so parts[0] is safe.
    country = parts[0]
    year = parts[1] if len(parts) > 1 and parts[1].isdigit() else None
    strategy_name = "_".join(parts[2:]) if len(parts) > 2 else None
    return {
        "doc_id": f"{country}_{year}" if year else country,
        "country": country,
        "year": year,
        "strategy_name": strategy_name,
    }


def extract_pdf_text(pdf_path):
    """Return the text of all pages of a PDF joined with newlines.

    Pages for which ``extract_text()`` returns nothing are skipped.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(text for text in page_texts if text)


#Iterate over all PDF files in the directory.
#Sorting makes the row order reproducible; glob order depends on the filesystem.
rows = []
for pdf_path in sorted(PDF_DIR.glob("*.pdf")):
    rows.append({
        **parse_pdf_stem(pdf_path.stem),
        "file_name": pdf_path.name,
        "text": extract_pdf_text(pdf_path),
    })

# Explicit columns keep the schema intact when no PDF was found; without them
# the frame has no "year" column and the conversion below raises KeyError.
df = pd.DataFrame(rows, columns=CSV_COLUMNS)

# Convert year to numeric, setting errors to NaN for non-numeric values
df["year"] = pd.to_numeric(df["year"], errors="coerce")

df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value


In [15]:
#TASK: Perform NER on the text data and append entities to the CSV
import pandas as pd
import spacy

#Read the document table; a missing text is treated as an empty string
df = pd.read_csv("document.csv")
df["text"] = df["text"].fillna("").astype(str)

#Load the spaCy pipeline and run it over every document text
nlp = spacy.load("en_core_web_sm")
docs = list(nlp.pipe(df["text"].tolist(), batch_size=32))

#Entity labels that each get their own "entities_<LABEL>" column
NER_TYPES = ["ORG", "PERSON"]

#For every label, collect the matching entity texts per document.
#dict.fromkeys drops repeats while keeping first-occurrence order.
for label in NER_TYPES:
    per_doc = [
        list(dict.fromkeys(ent.text for ent in doc.ents if ent.label_ == label))
        for doc in docs
    ]
    df[f"entities_{label}"] = pd.Series(per_doc, index=df.index, dtype=object)

#Write the enriched table back over the input file
df.to_csv("document.csv", index=False)

In [14]:
import pandas as pd
import spacy
import ipywidgets as widgets
from IPython.display import display, clear_output
import os

# --- SETTINGS ---
INPUT_FILE = "document.csv"
OUTPUT_FILE = "classified_entities.csv"
NER_TYPES = ["ORG", "GPE", "PERSON", "NORP"]
LABEL_COLUMNS = ["original_doc_index", "entity_name", "ner_label", "quadruple_helix_category"]

# 1. LOAD DATA & RUN NER
# OUTPUT_FILE doubles as the labelling progress file: NER only runs when it
# does not exist yet, otherwise the saved labels are resumed.
if not os.path.exists(OUTPUT_FILE):
    nlp = spacy.load("en_core_web_sm")
    df = pd.read_csv(INPUT_FILE)
    df["text"] = df["text"].fillna("").astype(str)
    entity_data = []
    # Stream the documents instead of holding every parsed Doc in memory.
    for i, doc in enumerate(nlp.pipe(df["text"].tolist(), batch_size=32)):
        # Each distinct entity text is recorded once per document (first label wins).
        seen_entities = set()
        for ent in doc.ents:
            if ent.label_ in NER_TYPES and ent.text not in seen_entities:
                entity_data.append({
                    "original_doc_index": i,
                    "entity_name": ent.text,
                    "ner_label": ent.label_,
                    "quadruple_helix_category": ""
                })
                seen_entities.add(ent.text)
    # Explicit columns keep the schema when no entity was found at all.
    df_to_label = pd.DataFrame(entity_data, columns=LABEL_COLUMNS)
    df_to_label.to_csv(OUTPUT_FILE, index=False)
else:
    # Read the text columns as plain strings and disable NA parsing: otherwise
    # the empty category column comes back as float NaN (so writing a string
    # label into it is a dtype mismatch) and entity names such as "NA" or
    # "null" turn into NaN, which the Text widget cannot display.
    df_to_label = pd.read_csv(
        OUTPUT_FILE,
        dtype={"entity_name": str, "ner_label": str, "quadruple_helix_category": str},
        keep_default_na=False,
    )

# 2. UI LOGIC
# Resume at the first row without a category, or past the end if none is left.
category_col = df_to_label["quadruple_helix_category"]
unlabeled = df_to_label[category_col.isna() | (category_col == "")]
current_index = int(unlabeled.index.min() if not unlabeled.empty else len(df_to_label))

# UI Elements
progress_label = widgets.Label(value="")
# Editable field for correcting the detected entity name
name_input = widgets.Text(description='Correct Name:', style={'description_width': 'initial'})
label_text = widgets.HTML()
output = widgets.Output()

def update_display():
    """Refresh the labelling widgets for the row at ``current_index``.

    While rows remain, shows the progress counter, pre-fills the name field
    with the detected entity name and renders the original NER label. Once
    ``current_index`` is past the last row, shows the completion message and
    hides the name field.
    """
    with output:
        clear_output(wait=True)
        if current_index < len(df_to_label):
            row = df_to_label.iloc[current_index]
            # The "done" branch hides the name field; restore it here so that
            # pressing Back from the completion screen shows it again.
            name_input.layout.display = None
            progress_label.value = f"Progress: {current_index}/{len(df_to_label)}"
            name_input.value = row['entity_name'] # Pre-fill with detected name
            label_text.value = f"""
                <div style="border: 2px solid #4A90E2; padding: 10px; border-radius: 8px; background-color: #f9f9f9; margin-bottom: 10px;">
                    <span style="color: #666;">Original NER Label: <b>{row['ner_label']}</b></span>
                </div>
            """
        else:
            progress_label.value = "Done!"
            label_text.value = "<h3>✅ All entities classified!</h3>"
            name_input.layout.display = 'none'

def save_and_next(category):
    """Store the chosen category for the current row and move to the next one.

    The (possibly edited) name from the text field is written back together
    with the category, the whole table is saved to OUTPUT_FILE, and the
    display is refreshed. Does nothing once every row has been handled.
    """
    global current_index
    if current_index >= len(df_to_label):
        return
    position = current_index
    # Persist both the corrected name and the selected category.
    df_to_label.at[position, 'entity_name'] = name_input.value
    df_to_label.at[position, 'quadruple_helix_category'] = category
    df_to_label.to_csv(OUTPUT_FILE, index=False)
    current_index = position + 1
    update_display()

def go_back(b):
    """Button handler: step back one row and refresh, unless already at row 0.

    ``b`` is the clicked button passed in by ``on_click``; it is not used.
    """
    global current_index
    if current_index <= 0:
        return
    current_index = current_index - 1
    update_display()

# Buttons: one per category; every category except "Skip" gets the 'info' style
categories = ["Academia", "Industry", "Government", "Civil Society", "Skip"]
btns = []
for category_name in categories:
    if category_name != "Skip":
        style_name = 'info'
    else:
        style_name = ''
    button = widgets.Button(description=category_name, button_style=style_name)
    # Bind the category through a default argument so each handler keeps its own value.
    button.on_click(lambda b, c=category_name: save_and_next(c))
    btns.append(button)

back_btn = widgets.Button(description="⬅ Back", button_style='')
back_btn.on_click(go_back)

# Layout: progress, NER label, editable name, category buttons, spacer, Back, output area
ui_children = [
    progress_label,
    label_text,
    name_input,
    widgets.HBox(btns),
    widgets.HTML("<br>"),
    back_btn,
    output,
]
display(widgets.VBox(ui_children))

# Show the first unlabeled entity right away
update_display()

VBox(children=(Label(value=''), HTML(value=''), Text(value='', description='Correct Name:', style=TextStyle(de…