# EDGAR Ground Truth Combined Extraction (Full Context)

**Goal**: Extract all 10 fields defined in `QUESTION_BANK` from SEC 10-K filings with evidence provenance.

**Model**: Qwen 2.5 32B Instruct via **Transformers** (HuggingFace)

**Strategy**: Per-field sequential extraction with JSON structured output.

**Output**: CSV with `{field}_value`, `{field}_evidence`, `{field}_source_sentence`, `{field}_evidence_verified`, and `{field}_found` for each field.

In [None]:
# 1. Installation (Run once on Lambda) 
#%env HF_TOKEN=your_token
#!pip install -q datasets pandas tqdm thefuzz python-Levenshtein transformers huggingface_hub accelerate bitsandbytes 

In [None]:
# 2. Imports & Setup
import os
import re
import gc
import json
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, logging, BitsAndBytesConfig
from huggingface_hub import login 

# Only show error-level messages from the Transformers logger.
logging.set_verbosity_error()
# max_colwidth=None: do not truncate cell contents when DataFrames are printed.
pd.set_option('display.max_colwidth', None) 
# Reads the token from the HF_TOKEN environment variable (KeyError if it is unset).
login(token=os.environ["HF_TOKEN"]) # or login(token="your_token")
print("Setup Complete.")

In [None]:
# 3. Load Model (Qwen 2.5 32B Instruct via Transformers)

# Hugging Face Hub repository id of the instruct model. The published id has
# no hyphen between "Qwen" and "2.5".
MODEL_NAME = "Qwen/Qwen2.5-32B-Instruct"
DTYPE = torch.bfloat16
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 4-bit NF4 weight quantization via bitsandbytes. The compute-dtype option is
# spelled `bnb_4bit_compute_dtype`; an unrecognized keyword is not applied, so
# a misspelling silently leaves the compute dtype at the library default.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=DTYPE,
)

print(f"Loading {MODEL_NAME} via Transformers...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# generate() needs a pad token id; fall back to EOS when the tokenizer has none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",  # let Accelerate decide where the weights are placed
)
model.eval()

print(f"Model loaded on {torch.cuda.device_count()} GPU(s). Device: {DEVICE}")

In [None]:
# 4. Configuration

# Defaults used by run_extraction() / run_single_feature_extraction().
OUTPUT_FILE = "edgar_gt_combined_extracted.csv"
BATCH_SIZE = 10  # Save checkpoint every N documents
MAX_DOCUMENTS = 250  # Total documents to process
USE_BATCH_MODE = False  # True = one prompt for all fields

# Generation parameters for deterministic output (replaces SamplingParams)
# The dict is unpacked as keyword arguments into model.generate() by
# EDGARExtractor._generate.
GENERATION_CONFIG = {
    "max_new_tokens": 500,
    "do_sample": False,
    "temperature": None,  # Greedy decoding when do_sample=False
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}

In [None]:
# 5. Question Bank (All Fields)

# One entry per field to extract.
#   "id":     field identifier; used as the prefix of the output column names
#             (e.g. "<id>_value", "<id>_evidence").
#   "prompt": question text inserted into the user prompt sent to the model.
QUESTION_BANK = [
    {
        "id": "registrant_name",
        "prompt": (
            "What is the exact legal name of the registrant as explicitly stated in the document? "
            "Return ONLY the legal name string or NOT_FOUND."
        ),
    },
    {
        "id": "headquarters_city",
        "prompt": (
            "What city is explicitly stated as the location of the registrant's principal executive offices? "
            "Return ONLY the city name or NOT_FOUND."
        ),
    },
    {
        "id": "headquarters_state",
        "prompt": (
            "What U.S. state is explicitly stated as the location of the registrant's principal executive offices? "
            "Return ONLY the state name or NOT_FOUND."
        ),
    },
    {
        "id": "incorporation_state",
        "prompt": (
            "Identify the state or other jurisdiction under the laws of which the registrant is **currently** organized or incorporated. "
            "Exclude former jurisdictions. "
            "Return ONLY the state name or NOT_FOUND."
        ),
    },
    {
        "id": "incorporation_year",
        "prompt": (
            "What is the year of the registrant's incorporation in its **current** jurisdiction (state of incorporation)? "
            "Return ONLY the year (YYYY) or NOT_FOUND."
        ),
    },
    {
        "id": "employees_count_total",
        "prompt": (
            "What is the **total** number of persons employed by the registrant? "
            "Include full-time and part-time employees. "
            "Return ONLY the integer (remove commas) or NOT_FOUND."
        ),
    },
    {
        "id": "employees_count_full_time",
        "prompt": (
            "What is the number of **full-time** employees explicitly stated? "
            "Return ONLY the integer (remove commas) or NOT_FOUND."
        ),
    },
    {
        "id": "irs_tax_id",
        "prompt": (
            "What is the IRS Employer Identification Number (EIN) or I.R.S. Employer Identification No. of the registrant? "
            "Return ONLY the number (format XX-XXXXXXX) or NOT_FOUND."
        ),
    },
    {
        "id": "ceo_lastname",
        "prompt": (
            "What is the last name of the individual explicitly identified as the Chief Executive Officer (CEO) of the registrant as explicitly stated in the document? "
            "Return ONLY the last name string or NOT_FOUND."
        ),
    },
    {
        "id": "holder_record_amount",
        "prompt": (
            "What is the number of holders of record of the registrant's common stock as explicitly stated in the document? "
            "Return ONLY the integer (remove commas) or NOT_FOUND."
        ),
    },
]

In [None]:
# 6. Full Context Builder (Concatenates ALL Sections)

# Keys of the per-section text fields read from each dataset record, in the
# order they are concatenated.
SECTION_KEYS = [
    "section_1",
    "section_1A",
    "section_1B",
    "section_2",
    "section_3",
    "section_4",
    "section_5",
    "section_6",
    "section_7",
    "section_7A",
    "section_8",
    "section_9",
    "section_9A",
    "section_9B",
    "section_10",
    "section_11",
    "section_12",
    "section_13",
    "section_14",
    "section_15",
]
# Free-text columns of the output CSV whose NaNs are replaced by "".
TEXT_COLUMNS = ["full_text", *SECTION_KEYS]


def build_full_context(doc):
    """
    Join every non-blank section of a filing into one string.

    Each section is preceded by a "--- [SECTION_X] ---" header line. Sections
    that are missing, empty or whitespace-only are left out. Returns "" when
    no section has content.
    """
    rendered = []
    for section_key in SECTION_KEYS:
        body = doc.get(section_key, "")
        if not body or not body.strip():
            continue
        header = f"\n\n--- [{section_key.upper()}] ---\n\n"
        rendered.append(header + body)
    return "".join(rendered)


def fill_missing_text_columns(df):
    """Replace NaN with "" in the text columns present in df (modified in place and returned)."""
    present = [name for name in TEXT_COLUMNS if name in df.columns]
    for name in present:
        df[name] = df[name].fillna("")
    return df

In [None]:
# 7. Extraction Prompt Templates (chat formatting is applied later via tokenizer.apply_chat_template)

# System message placed first in every chat request (see EDGARExtractor._build_prompt).
SYSTEM_PROMPT = """You are a precise SEC 10-K filing data extraction assistant. You MUST:
1. Extract information ONLY from the provided text.
2. If the information is not found, return "NOT_FOUND" as the value, and for evidence provide your reasoning on why it was not found.
3. Always provide the exact quote from the text as evidence. 
4. Respond ONLY with valid JSON, nothing else."""

# Single-field user prompt. Filled with str.format using `question` and `context`.
USER_TEMPLATE = """Read this SEC 10-K filing and answer the question.

**Question**: {question}

**Instructions**:
- Provide your answer as a JSON object with exactly these keys:
  - "value": The extracted answer (or "NOT_FOUND" if not present)
  - "evidence": If value is found, the EXACT substring from the text. If NOT_FOUND, explain WHY it could not be found.
  - "source_sentence": The complete sentence containing the evidence (or "N/A" if NOT_FOUND)

**10-K Filing Text**:
{context}"""

# Multi-field user prompt. Filled with `questions`, `field_id` and `context`.
# extract_batch passes field_id="{field_id}", so that placeholder is emitted
# literally and the model sees the key pattern rather than a concrete id.
BATCH_USER_TEMPLATE = """Read this SEC 10-K filing and extract ALL of the fields listed below.

**Fields to Extract**:
{questions}

**Instructions**:
- Return a SINGLE JSON object.
- For each field, include these keys:
  - "{field_id}_value"
  - "{field_id}_evidence"
  - "{field_id}_source_sentence"
- If a value is not found, set "{field_id}_value" to "NOT_FOUND",
  set "{field_id}_evidence" to a brief reason, and "{field_id}_source_sentence" to "N/A".

**10-K Filing Text**:
{context}"""

In [None]:
# 8. LLM Extraction Class (Transformers version)

class EDGARExtractor:
    """Sends extraction prompts to a causal LM and parses its JSON replies.

    Args:
        model: Object exposing ``generate(**kwargs)`` and a ``device`` attribute.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__`` and
            ``decode``.
        generation_config: Dict of keyword arguments forwarded to
            ``model.generate``.
    """

    def __init__(self, model, tokenizer, generation_config):
        self.model = model
        self.tokenizer = tokenizer
        self.generation_config = generation_config

    def _build_prompt(self, prompt_text):
        """Wrap the user prompt with SYSTEM_PROMPT using the tokenizer's chat template."""
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt_text},
        ]
        return self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

    def _generate(self, prompt):
        """Run generation for one prompt and return only the newly generated text."""
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                **self.generation_config
            )
        # The output sequence starts with the prompt tokens; slice them off.
        return self.tokenizer.decode(
            outputs[0][inputs.input_ids.shape[1]:],
            skip_special_tokens=True
        ).strip()

    @staticmethod
    def _first_json_object(response_text, outermost_only=False):
        """Locate and decode the first JSON object embedded in response_text.

        Decoding starts at each "{" in turn and uses the JSON decoder itself to
        find the matching end, so braces inside string values and any text
        after the object (code fences, commentary) do not break parsing.

        Args:
            response_text: Raw model reply.
            outermost_only: When True, only the first "{" is tried, so a
                nested fragment of a malformed outer object is never returned.

        Returns:
            (dict, None) on success, otherwise (None, last JSONDecodeError or None).
        """
        decoder = json.JSONDecoder()
        last_error = None
        start = response_text.find("{")
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(response_text, start)
            except json.JSONDecodeError as err:
                last_error = err
            else:
                if isinstance(parsed, dict):
                    return parsed, None
            if outermost_only:
                break
            start = response_text.find("{", start + 1)
        return None, last_error

    def _parse_json(self, response_text, question_id=None, is_batch=False):
        """Parse the model reply into a dict.

        Args:
            response_text: Raw model reply.
            question_id: Field id used to label the error message (optional).
            is_batch: True when the reply is expected to be the single
                all-fields object; only the outermost object is accepted then.

        Returns:
            The decoded dict, or None (after printing a diagnostic) when no
            JSON object could be decoded. Non-object JSON (lists, scalars) is
            treated as a failure because callers use ``.get``.
        """
        data, error = self._first_json_object(response_text, outermost_only=is_batch)
        if data is not None:
            return data
        label = question_id or "batch"
        reason = error if error is not None else "no JSON object found in response"
        print(f"  [JSON ERROR] {label}: {reason}")
        print(f"  Raw response: {response_text[:200]}...")
        return None

    def extract_single(self, full_text, question_config):
        """Ask one question about one filing.

        Args:
            full_text: Filing text used as context.
            question_config: Dict with "id" and "prompt" keys.

        Returns:
            Tuple (value, evidence, source_sentence). Sentinel tuples are
            returned for blank context ("NOT_FOUND"/"NO_CONTEXT") and for an
            unparseable reply ("JSON_PARSE_ERROR" plus the first 500
            characters of the raw reply as evidence).
        """
        if not full_text or not full_text.strip():
            return "NOT_FOUND", "NO_CONTEXT", "NO_CONTEXT"
        prompt_text = USER_TEMPLATE.format(
            question=question_config["prompt"],
            context=full_text
        )
        prompt = self._build_prompt(prompt_text)
        response_text = self._generate(prompt)
        data = self._parse_json(response_text, question_id=question_config["id"], is_batch=False)
        if data is None:
            return "JSON_PARSE_ERROR", response_text[:500], "JSON_PARSE_ERROR"
        value = data.get("value", "PARSE_ERROR")
        evidence = data.get("evidence", "PARSE_ERROR")
        source_sentence = data.get("source_sentence", "PARSE_ERROR")
        if value:
            # Normalize to a string without surrounding whitespace or a trailing period.
            value = str(value).strip().rstrip('.')
        return value, evidence, source_sentence

    def extract_batch(self, full_text, question_bank):
        """Ask all questions in one prompt.

        Returns:
            The parsed reply dict (keys are expected to look like
            "<id>_value", "<id>_evidence", "<id>_source_sentence"), or None
            for blank context or an unparseable reply.
        """
        if not full_text or not full_text.strip():
            return None
        questions = "\n".join([f"- {q['id']}: {q['prompt']}" for q in question_bank])
        prompt_text = BATCH_USER_TEMPLATE.format(
            questions=questions,
            field_id="{field_id}",  # keep the placeholder literal in the prompt
            context=full_text
        )
        prompt = self._build_prompt(prompt_text)
        response_text = self._generate(prompt)
        return self._parse_json(response_text, is_batch=True)

In [None]:
# 9. Evidence Verification ("Judge" Logic)

def get_fingerprint(text):
    """Return text lower-cased with every non-alphanumeric character (and "_") removed."""
    return re.sub(r'[\W_]+', '', text).lower()


# Values that mean "the extraction itself failed", so there is no quote to check.
_FAILED_VALUE_MARKERS = frozenset(
    {"JSON_PARSE_ERROR", "PARSE_ERROR", "ERROR", "NO_CONTEXT", "NO_CONTENT"}
)
# Evidence placeholders written by the pipeline instead of a real quote.
_EVIDENCE_PLACEHOLDERS = frozenset(
    {"NO_CONTEXT", "NO_CONTENT", "PARSE_ERROR", "JSON_PARSE_ERROR", "ERROR", "N/A"}
)


def verify_evidence(full_text, evidence_quote, value):
    """
    Check whether the evidence quote actually occurs in the filing text.

    Matching is attempted in three increasingly lenient steps: exact substring,
    whitespace/case-normalized substring, then alphanumeric-only fingerprint.

    Args:
        full_text: The text the model was given.
        evidence_quote: The quote returned by the model. Non-string values
            (e.g. a JSON number) are converted with str() before matching.
        value: The extracted value for the same field.

    Returns:
        - True if verified (evidence found in text)
        - False if not found (potential hallucination)
        - "NOT_APPLICABLE" if value was NOT_FOUND (evidence is reasoning, not a quote)
        - None if there is nothing to verify (missing/placeholder evidence, or
          the value is an extraction-failure marker)
    """
    if value:
        marker = str(value).strip().upper()
        # If value is NOT_FOUND, evidence is reasoning - don't try to verify it exists
        if marker == "NOT_FOUND":
            return "NOT_APPLICABLE"
        # On a failed extraction the "evidence" is raw model output, not a
        # quote; reporting False would mislabel it as a hallucination.
        if marker in _FAILED_VALUE_MARKERS:
            return None

    if not evidence_quote:
        return None  # Missing data

    # The model may return a number or list; `x in str` requires a str operand.
    if not isinstance(evidence_quote, str):
        evidence_quote = str(evidence_quote)

    stripped = evidence_quote.strip()
    if not stripped or stripped in _EVIDENCE_PLACEHOLDERS:
        return None  # Blank or placeholder evidence

    if not full_text:
        return False

    # 1. Exact match
    if evidence_quote in full_text:
        return True

    # 2. Normalized match (collapse whitespace runs, ignore case)
    clean_text = " ".join(full_text.split()).lower()
    clean_evd = " ".join(evidence_quote.split()).lower()

    if clean_evd in clean_text:
        return True

    # 3. Fingerprint match (ignore all punctuation and spacing). Very short
    #    fingerprints are rejected because they would match almost anywhere.
    fp_text = get_fingerprint(full_text)
    fp_evd = get_fingerprint(evidence_quote)

    if len(fp_evd) > 10 and fp_evd in fp_text:
        return True

    return False

In [None]:
# 10. Load Dataset (from existing notebook pattern)

def load_edgar_dataset():
    """Open the EDGAR corpus "train" split in streaming mode.

    With streaming=True the records are iterated lazily instead of being
    downloaded up front. The "refs/convert/parquet" revision is requested
    explicitly (presumably the Hub's auto-converted Parquet branch -- confirm).

    Returns:
        The object returned by ``datasets.load_dataset``; callers iterate it
        and read fields such as "filename", "cik", "year" and "section_*".
    """
    return load_dataset(
        "c3po-ai/edgar-corpus",
        "default",
        split="train",
        streaming=True,
        revision="refs/convert/parquet",
    )

In [None]:
# 11. Process Single Document

def process_document(doc, extractor):
    """
    Run every QUESTION_BANK field against one filing.

    The returned dict holds the identity columns, each raw section, the
    concatenated "full_text", and for every field the columns
    "<id>_value", "<id>_evidence", "<id>_source_sentence",
    "<id>_evidence_verified" and "<id>_found".
    """
    not_found_markers = [
        "NOT_FOUND", "NO_CONTENT", "NO_CONTEXT", "JSON_PARSE_ERROR", "PARSE_ERROR", "ERROR", "None"
    ]

    def is_found(value):
        return str(value) not in not_found_markers

    def record(row, field_id, value, evidence, source_sentence, verified, found):
        # Column insertion order here defines the CSV column order.
        row[f"{field_id}_value"] = value
        row[f"{field_id}_evidence"] = evidence
        row[f"{field_id}_source_sentence"] = source_sentence
        row[f"{field_id}_evidence_verified"] = verified
        row[f"{field_id}_found"] = found

    def record_placeholder(row, marker):
        # Same marker in every text column, unverified and not found.
        for q in QUESTION_BANK:
            record(row, q["id"], marker, marker, marker, None, False)

    result = {name: doc.get(name) for name in ("filename", "cik", "year")}
    result.update({key: doc.get(key, "") for key in SECTION_KEYS})

    # The context is assembled once and reused for every question.
    full_text = build_full_context(doc)
    result["full_text"] = full_text  # Store for manual review

    if not full_text:
        # Nothing to read: mark every field accordingly.
        record_placeholder(result, "NO_CONTENT")
        return result

    if USE_BATCH_MODE:
        batch_data = extractor.extract_batch(full_text, QUESTION_BANK)
        if not batch_data:
            record_placeholder(result, "ERROR")
            return result
        for q in QUESTION_BANK:
            field_id = q["id"]
            value = batch_data.get(f"{field_id}_value", "NOT_FOUND")
            evidence = batch_data.get(f"{field_id}_evidence", "NOT_FOUND")
            source_sentence = batch_data.get(f"{field_id}_source_sentence", "NOT_FOUND")
            verified = verify_evidence(full_text, evidence, value)
            record(result, field_id, value, evidence, source_sentence, verified, is_found(value))
        return result

    # Single mode: one model call per field.
    for question in QUESTION_BANK:
        field_id = question["id"]
        print(f"Extracting {field_id} from {result['filename']}...")
        value, evidence, source_sentence = extractor.extract_single(full_text, question)
        print(f"  Verifying evidence for {field_id} from {result['filename']}...")
        verified = verify_evidence(full_text, evidence, value)
        record(result, field_id, value, evidence, source_sentence, verified, is_found(value))

    return result

In [None]:
# 12. Main Extraction Loop

def _append_batch_and_save(df_results, batch_rows, output_file):
    """Append batch_rows to df_results, rewrite the CSV, and return the new frame."""
    df_batch = fill_missing_text_columns(pd.DataFrame(batch_rows))
    df_results = pd.concat([df_results, df_batch], ignore_index=True)
    df_results.to_csv(output_file, index=False)
    return df_results


def run_extraction(
    output_file=OUTPUT_FILE,
    limit=MAX_DOCUMENTS,
    batch_size=BATCH_SIZE,
):
    """
    Main extraction loop with resume support and batch checkpointing.

    Args:
        output_file: CSV to create or resume from. Rows already present
            (matched on "filename") are skipped.
        limit: Maximum number of documents in the output, counting rows
            already present in output_file.
        batch_size: Number of newly processed documents between checkpoints.

    Returns:
        DataFrame with all rows (previously saved plus newly processed).

    Rows processed since the last checkpoint are written out even when the
    loop is interrupted by an exception or KeyboardInterrupt; the exception
    still propagates afterwards.
    """
    print(f"--- COMBINED EXTRACTION: {limit} documents ---")
    extractor = EDGARExtractor(model, tokenizer, GENERATION_CONFIG)

    # 1. Resume support: Load existing progress
    if os.path.exists(output_file):
        df_results = pd.read_csv(output_file)
        df_results = fill_missing_text_columns(df_results)
        processed_files = set(df_results["filename"].tolist())
        print(f"Resuming: {len(processed_files)} documents already processed.")
    else:
        df_results = pd.DataFrame()
        processed_files = set()

    current_batch = []
    total_processed = len(processed_files)
    new_processed = 0

    # Skip opening the remote dataset entirely when there is nothing left to do.
    if total_processed < limit:
        # 2. Load dataset
        dataset = load_edgar_dataset()

        # 3. Main loop
        try:
            for doc in tqdm(dataset, desc="Extracting", total=limit):
                # Limit check first, so the stream is not read past the point of need.
                if total_processed + new_processed >= limit:
                    break

                # Skip if already processed
                if doc.get("filename") in processed_files:
                    continue

                current_batch.append(process_document(doc, extractor))
                new_processed += 1

                # Batch checkpoint
                if len(current_batch) >= batch_size:
                    df_results = _append_batch_and_save(df_results, current_batch, output_file)
                    current_batch = []

                    # Memory cleanup
                    gc.collect()
                    torch.cuda.empty_cache()
                    print(f"  [Checkpoint] Saved {total_processed + new_processed}/{limit} docs.")
        finally:
            # 4. Final save (also runs on failure so finished work is kept)
            if current_batch:
                df_results = _append_batch_and_save(df_results, current_batch, output_file)

    print(f"--- EXTRACTION COMPLETE: {output_file} ---")
    return df_results

In [None]:
# 13. Run Extraction

# Runs (or resumes) the combined extraction with the configured defaults and
# keeps the resulting DataFrame for the inspection cells below.
df_extracted = run_extraction(
    output_file=OUTPUT_FILE,
    limit=MAX_DOCUMENTS,
    batch_size=BATCH_SIZE,
)

# Display sample results
print(f"\nTotal rows: {len(df_extracted)}")
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter builtin -- this line fails outside a notebook.
display(df_extracted.head())

In [None]:
# 14. Quality Report

def generate_quality_report(df):
    """Print, per QUESTION_BANK field, how many rows were found and verified.

    A value counts as found when it is present and is none of the pipeline's
    not-found/error markers (the same set process_document uses for the
    "<id>_found" column). Evidence counts as verified when the
    "<id>_evidence_verified" cell is True; cells are compared as text so the
    count also works for mixed columns (True/False/"NOT_APPLICABLE"/None) and
    for frames re-loaded from CSV, where the flags are strings.

    Args:
        df: Extraction results, one row per document.
    """
    not_found_markers = [
        "NOT_FOUND", "NO_CONTENT", "NO_CONTEXT", "JSON_PARSE_ERROR", "PARSE_ERROR", "ERROR", "None"
    ]
    total = len(df)

    print("\n=== EXTRACTION QUALITY REPORT ===")
    print(f"Total documents: {total}")
    print()

    for q in QUESTION_BANK:
        field_id = q["id"]
        value_col = f"{field_id}_value"
        verified_col = f"{field_id}_evidence_verified"

        if value_col not in df.columns:
            continue

        values = df[value_col]
        found = int((values.notna() & ~values.astype(str).isin(not_found_markers)).sum())
        if verified_col in df.columns:
            flags = df[verified_col].astype(str).str.strip().str.lower()
            verified = int(flags.eq("true").sum())
        else:
            verified = "N/A"
        # Avoid dividing by zero when the frame has no rows.
        found_pct = 100 * found / total if total else 0.0

        print(f"{field_id}:")
        print(f"  Found: {found}/{total} ({found_pct:.1f}%)")
        print(f"  Evidence Verified: {verified}")
        print()

if not df_extracted.empty:
    generate_quality_report(df_extracted)

In [None]:
# 15. Inspect Specific Results (by row number)

ROW_NUMBER = 0  # Change this to inspect different rows

# Guard first: nothing to show for an empty frame or an out-of-range index.
if df_extracted.empty or not ROW_NUMBER < len(df_extracted):
    print(f"Row {ROW_NUMBER} not found. DataFrame has {len(df_extracted)} rows.")
else:
    row = df_extracted.iloc[ROW_NUMBER]

    print(f"=== ROW {ROW_NUMBER}: {row['filename']} ===")
    print(f"CIK: {row['cik']} | Year: {row['year']}")
    print()

    # One short summary per extracted field.
    for q in QUESTION_BANK:
        field_id = q["id"]
        print(f"--- {field_id} ---")
        for label, suffix in (("Value", "value"), ("Evidence", "evidence"), ("Verified", "evidence_verified")):
            print(f"  {label}: {row[f'{field_id}_{suffix}']}")
        print()


In [None]:

ROW_TO_INSPECT = 0  # Change this to view different rows

# Print the stored concatenated filing text of one row for manual review.
if df_extracted.empty or not ROW_TO_INSPECT < len(df_extracted):
    print(f"Row {ROW_TO_INSPECT} not found.")
else:
    row = df_extracted.iloc[ROW_TO_INSPECT]

    print(f"=== FULL TEXT: {row['filename']} ===")
    print(f"Length: {len(row['full_text'])} characters")
    print("=" * 60)
    print(row['full_text'])

In [None]:
def split_combined_df_by_feature(
    df,
    output_dir="./split_features/",
    id_columns=None,
):
    """
    Split a combined extraction DataFrame into separate CSVs per feature.

    A feature is discovered from every column named "<feature_id>_value"; only
    that trailing suffix is removed, so ids that themselves contain "_value"
    are kept intact.

    Args:
        df: The master DataFrame containing all features.
        output_dir: Directory to save the split CSV files (created if missing).
        id_columns: List of identity columns to keep in each file.
                    Default: ["filename", "cik", "year"]

    Returns:
        Dict mapping feature_id -> output file path.

    Example Output Files:
        - split_features/extracted_registrant_name.csv
        - split_features/extracted_employees_count_total.csv
        - ...
    """
    if id_columns is None:
        id_columns = ["filename", "cik", "year"]

    # Markers the pipeline writes in place of a real value.
    not_found_markers = [
        "NOT_FOUND", "NO_CONTENT", "NO_CONTEXT", "JSON_PARSE_ERROR", "PARSE_ERROR", "ERROR", "None"
    ]
    value_suffix = "_value"

    # Create output directory if needed
    os.makedirs(output_dir, exist_ok=True)

    # Discover which features are in the DataFrame
    # Pattern: {feature_id}_value, {feature_id}_evidence, etc.
    feature_ids = sorted(
        {col[:-len(value_suffix)] for col in df.columns if col.endswith(value_suffix)}
    )

    if not feature_ids:
        print("No feature columns found in DataFrame. Looking for *_value columns.")
        return {}

    print(f"Found {len(feature_ids)} features to split: {feature_ids}")

    kept_id_columns = [c for c in id_columns if c in df.columns]
    output_files = {}

    for feature_id in feature_ids:
        # Columns belonging to this feature, restricted to those that exist.
        feature_cols = [
            f"{feature_id}_value",
            f"{feature_id}_evidence",
            f"{feature_id}_source_sentence",
            f"{feature_id}_evidence_verified",
        ]
        existing_feature_cols = [c for c in feature_cols if c in df.columns]

        if not existing_feature_cols:
            print(f"  [SKIP] {feature_id}: No columns found.")
            continue

        # Select identity + feature columns
        df_feature = df[kept_id_columns + existing_feature_cols].copy()

        output_path = os.path.join(output_dir, f"extracted_{feature_id}.csv")
        df_feature.to_csv(output_path, index=False)
        output_files[feature_id] = output_path

        # Summary: missing cells and marker strings both count as "not found".
        values = df_feature[f"{feature_id}_value"]
        total_rows = len(df_feature)
        found_count = int((values.notna() & ~values.astype(str).isin(not_found_markers)).sum())
        # A frame with columns but no rows must not divide by zero.
        found_pct = 100 * found_count / total_rows if total_rows else 0.0

        print(f"  ✓ {feature_id}: {output_path}")
        print(f"      Rows: {total_rows} | Found: {found_count} ({found_pct:.1f}%)")

    print(f"\n--- SPLIT COMPLETE: {len(output_files)} files written to {output_dir} ---")
    return output_files

In [None]:
# --- Split Combined DataFrame ---
# After running full extraction, split the combined results into one CSV per
# feature. Skipped when no rows were extracted.
if not df_extracted.empty:
    output_paths = split_combined_df_by_feature(
        df=df_extracted,
        output_dir="./split_features/",
    )
    # Mapping of feature id -> path of the CSV written for it.
    print(output_paths)

# EDGAR Ground Truth Singular Extraction (Full Context)

**Goal**: Extract 1 field from SEC 10-K filings with evidence provenance (using all sections this time).

**Model**: Qwen 2.5 32B Instruct via **Transformers** (HuggingFace) — the same model loaded earlier in this notebook

**Strategy**: Per-field sequential extraction with JSON structured output.

**Output**: CSV with `{field}_value`, `{field}_evidence`, `{field}_source_sentence`, `{field}_evidence_verified` for the selected field.

In [None]:

# **Purpose**: Extract a single feature from QUESTION_BANK to a CSV, with 
# append/merge support like `discover_new_feature` in the single notebook.
#
# Uses the existing Transformers setup (model, tokenizer, GENERATION_CONFIG) from this notebook.
# ==============================================================================

def merge_and_save_to_csv(df_master, batch_data, output_path):
    """
    Merge a batch of result rows into the master DataFrame and write it to CSV.

    - If filename exists: the batch's feature columns are added to that row.
      For a column that already exists, only missing (NaN) cells are filled;
      existing values are never overwritten.
    - If filename is new: the row is appended (outer join on "filename").

    Adapted from extract_edgar_gt_single.ipynb's merge_and_save.

    Args:
        df_master: Current master DataFrame (may be empty). Its "filename"
            column is converted to str in place.
        batch_data: List of row dicts; each must contain "filename".
        output_path: CSV path that is rewritten with the merged result.

    Returns:
        The merged DataFrame.
    """
    identity_cols = ("filename", "cik", "year")
    df_batch = pd.DataFrame(batch_data)

    if df_master.empty:
        df_master = df_batch
    else:
        # Normalize merge key to string
        df_master["filename"] = df_master["filename"].astype(str)
        df_batch["filename"] = df_batch["filename"].astype(str)

        # Identity columns other than the key are left out of the merge to
        # prevent duplicated cik/year columns.
        batch_feature_cols = [col for col in df_batch.columns if col not in identity_cols]
        # Columns present on both sides come back from the merge twice
        # (right-hand copy suffixed "_new"). Remember them up front so cleanup
        # targets exactly those columns instead of guessing from names.
        overlapping_cols = [col for col in batch_feature_cols if col in df_master.columns]

        # Merge on filename (outer join to add new filenames)
        df_master = pd.merge(
            df_master,
            df_batch[["filename"] + batch_feature_cols],
            on="filename",
            how="outer",
            suffixes=("", "_new")
        )

        # Fill in cik/year for rows that came only from the batch.
        for id_col in ("cik", "year"):
            if (
                id_col in df_master.columns
                and id_col in df_batch.columns
                and df_master[id_col].isna().any()
            ):
                lookup = df_batch.set_index("filename")[id_col]
                df_master[id_col] = df_master[id_col].fillna(df_master["filename"].map(lookup))

        # Fold each "<col>_new" copy back into "<col>" and drop the copy.
        for base_col in overlapping_cols:
            new_col = f"{base_col}_new"
            if new_col in df_master.columns:
                df_master[base_col] = df_master[base_col].fillna(df_master[new_col])
                df_master = df_master.drop(columns=[new_col])

    df_master.to_csv(output_path, index=False)
    return df_master


In [None]:

def run_single_feature_extraction(
    feature_id,
    output_file="single_feature_extracted.csv",
    limit=MAX_DOCUMENTS,
    batch_size=BATCH_SIZE,
):
    """
    Extract a SINGLE feature from QUESTION_BANK using Transformers.

    Supports:
    - **Resume**: Skips documents already processed for this feature.
    - **Merge**: If file exists with other features, adds new columns.
    - **Append**: If file exists but new documents are processed, appends rows.

    Rows processed since the last checkpoint are saved even when the loop is
    interrupted by an exception or KeyboardInterrupt; the exception still
    propagates afterwards.

    Args:
        feature_id: The 'id' of the question in QUESTION_BANK to extract.
        output_file: Path to the output CSV (will be created or merged into).
        limit: Maximum number of documents to process, counting documents
            that already have this feature in output_file.
        batch_size: Checkpoint interval (saves every N docs).

    Returns:
        DataFrame with extraction results.

    Raises:
        ValueError: If feature_id is not an id in QUESTION_BANK.
    """
    print(f"--- SINGLE FEATURE EXTRACTION: {feature_id} ---")
    extractor = EDGARExtractor(model, tokenizer, GENERATION_CONFIG)

    # 1. Validate feature_id exists in QUESTION_BANK
    question_config = next((q for q in QUESTION_BANK if q["id"] == feature_id), None)
    if question_config is None:
        raise ValueError(f"Feature '{feature_id}' not found in QUESTION_BANK. Available: {[q['id'] for q in QUESTION_BANK]}")

    print(f"  Question: {question_config['prompt'][:80]}...")

    # 2. Column names for this feature
    value_col = f"{feature_id}_value"
    evidence_col = f"{feature_id}_evidence"
    source_col = f"{feature_id}_source_sentence"
    verified_col = f"{feature_id}_evidence_verified"

    # 3. Load existing progress (resume support)
    if os.path.exists(output_file):
        df_master = pd.read_csv(output_file)
        # Check if this feature has already been processed for some docs
        if value_col in df_master.columns:
            processed_files = set(
                df_master[df_master[value_col].notna()]["filename"].tolist()
            )
            print(f"  Resuming: {len(processed_files)} documents already have '{feature_id}'.")
        else:
            processed_files = set()
            print(f"  File exists, but '{feature_id}' is a NEW feature. Will merge columns.")
    else:
        df_master = pd.DataFrame()
        processed_files = set()
        print(f"  Starting fresh extraction.")

    current_batch = []
    total_processed = len(processed_files)
    new_processed = 0

    # Skip opening the remote dataset entirely when the limit is already met.
    if total_processed < limit:
        # 4. Load dataset
        dataset = load_edgar_dataset()

        # 5. Main extraction loop
        try:
            for doc in tqdm(dataset, desc=f"Extracting {feature_id}", total=limit):
                # Limit check first, so the stream is not read past the point of need.
                if total_processed + new_processed >= limit:
                    break

                fname = doc.get("filename")

                # Skip if already processed
                if fname in processed_files:
                    continue

                # Build full context
                full_text = build_full_context(doc)

                # Extract this single field
                value, evidence, source_sentence = extractor.extract_single(
                    full_text, question_config
                )

                # Verify evidence
                evidence_verified = verify_evidence(full_text, evidence, value)

                current_batch.append({
                    "filename": fname,
                    "cik": doc.get("cik"),
                    "year": doc.get("year"),
                    value_col: value,
                    evidence_col: evidence,
                    source_col: source_sentence,
                    verified_col: evidence_verified,
                })
                new_processed += 1

                # Batch checkpoint
                if len(current_batch) >= batch_size:
                    df_master = merge_and_save_to_csv(df_master, current_batch, output_file)
                    current_batch = []

                    gc.collect()
                    torch.cuda.empty_cache()
                    print(f"  [Checkpoint] Saved {total_processed + new_processed}/{limit} docs.")
        finally:
            # 6. Final save (also runs on failure so finished work is kept)
            if current_batch:
                df_master = merge_and_save_to_csv(df_master, current_batch, output_file)

    print(f"--- SINGLE FEATURE EXTRACTION COMPLETE: {output_file} ---")
    print(f"  Total documents with '{feature_id}': {len(df_master[df_master[value_col].notna()]) if value_col in df_master.columns else 0}")

    return df_master

In [None]:
# --- Single Feature Extraction ---
# Extract ONLY the 'employees_count_total' feature to its own file (with
# merge/append support). Stops once 100 documents have this feature and
# checkpoints every 10 newly processed documents.
df_single = run_single_feature_extraction(
    feature_id="employees_count_total",
    output_file="employee_count_only.csv",
    limit=100,
    batch_size=10,
)