# EDGAR Ground Truth Combined Extraction (Full Context)

**Goal**: Extract all 8 fields from SEC 10-K filings with evidence provenance.

**Model**: Llama 3.3 70B Instruct via vLLM on Lambda GPU

**Strategy**: Per-field sequential extraction with JSON structured output.

**Output**: CSV with `filename`, `cik`, `year`, `full_text`, plus `{field}_value`, `{field}_evidence`, `{field}_source_sentence`, `{field}_evidence_verified` for each field.

In [None]:
# 1. Installation (Run once on Lambda) 
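# Set the token with %env (or os.environ): a "!export" runs in its own subshell, so the variable would not persist.
#%env HF_TOKEN=your_token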
#!pip install -q vllm datasets pandas tqdm thefuzz python-Levenshtein

In [None]:
# 2. Imports & Setup
import os
import re
import gc
import json
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from vllm import LLM, SamplingParams

pd.set_option('display.max_colwidth', None)
print("Setup Complete.")

In [None]:
# 3. Load Model (Llama 3.3 70B Instruct via vLLM)

# Hugging Face model id handed to vLLM.
MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"

print(f"Loading {MODEL_NAME} via vLLM...")
llm = LLM(
    model=MODEL_NAME,
    tensor_parallel_size=torch.cuda.device_count(),  # Shard the model across every visible GPU
    # 64K-token window shared by the prompt and the generated answer.
    # NOTE(review): build_full_context applies no length cap, so a filing
    # longer than this is passed to llm.generate as-is - confirm how vLLM
    # handles over-length prompts.
    max_model_len=65536,
    # Lets vLLM reuse cached KV blocks between prompts that share a prefix.
    # NOTE(review): EXTRACTION_TEMPLATE puts the question before the filing
    # text, so presumably only the system prompt is shared between the
    # per-field prompts of one document - confirm.
    enable_prefix_caching=True,
    gpu_memory_utilization=0.90,
    dtype="bfloat16",
)
print(f"Model loaded on {torch.cuda.device_count()} GPU(s).")

In [None]:
# 4. Configuration

# CSV that receives the results; run_extraction also reads it back to resume.
OUTPUT_FILE = "edgar_gt_combined_extracted.csv"
BATCH_SIZE = 10  # Save checkpoint every N documents
# Target number of documents in the output, counting rows resumed from a
# previous run (run_extraction stops once resumed + new rows reach it).
MAX_DOCUMENTS = 250

# Sampling parameters for deterministic output
SAMPLING_PARAMS = SamplingParams(
    temperature=0.0,
    max_tokens=500,
    # Stop at the first "}" of the flat JSON answer; extract_field re-appends
    # the brace when the returned text does not end with one.
    # NOTE(review): a "}" inside a quoted evidence string would also end
    # generation early and surface as a JSON parse error.
    stop=["}"],
)

In [None]:
# 5. Question Bank (All 8 Fields)

# One entry per field to extract:
#   "id"     - field name; process_document uses it as the prefix of the
#              output columns ({id}_value, {id}_evidence, ...).
#   "prompt" - question text substituted for {question} in EXTRACTION_TEMPLATE.
QUESTION_BANK = [
    {
        "id": "registrant_name",
        "prompt": (
            "What is the exact legal name of the registrant? "
            "1. Look for the very first sentence of the 'Business' section or the cover page intro "
            "(e.g., 'Apple Inc. (the Registrant)...'). "
            "2. Do NOT use 'Doing Business As' (DBA) names or brand names. "
            "3. Do NOT include the stock ticker symbol. "
            "4. Include legal suffixes like 'Inc.', 'Corp.', 'Ltd.' if present. "
            "Answer with ONLY the legal name string."
        ),
    },
    {
        "id": "headquarters_city",
        "prompt": (
            "In which city are the registrant's *principal executive offices* physically located? "
            "1. Look for the address under 'Executive Offices' or 'Address of Principal Executive Offices'. "
            "2. CRITICAL WARNING: Do NOT return the city of the 'Registered Agent' or 'State of Incorporation' "
            "(e.g., ignore 'Wilmington' or 'Dover' unless the CEO actually works there). "
            "3. Ignore P.O. Boxes. "
            "Answer with ONLY the city name."
        ),
    },
    {
        "id": "headquarters_state",
        "prompt": (
            "In which U.S. state are the registrant's *principal executive offices* physically located? "
            "1. This is the state where the HQ building is, NOT necessarily the state of incorporation. "
            "2. CRITICAL: If the text says 'Incorporated in Delaware' but 'Executive offices in California', "
            "return CALIFORNIA. "
            "Answer with ONLY the state name."
        ),
    },
    {
        "id": "original_incorporation_state",
        "prompt": (
            "In which U.S. state was the registrant *originally* incorporated or organized? "
            "Follow this strict hierarchy: "
            "1. PRIORITIZE HISTORY: Look for phrases like 'originally incorporated in', 'formerly organized in', "
            "or 'predecessor company incorporated in'. "
            "2. REINCORPORATION RULE: If the company reincorporated (e.g., moved from California to Delaware), "
            "you MUST return the OLD state (California), not the current one. "
            "3. MERGER EXCEPTION: Only if the registrant is a *new* successor entity formed by a merger, "
            "return the state of that successor. "
            "4. If no history is mentioned, return the current state. "
            "Answer with ONLY the state name."
        ),
    },
    {
        "id": "original_incorporation_year",
        "prompt": (
            "In which year was the registrant *originally* incorporated or organized? "
            "1. IGNORE 'FOUNDED' dates. Only look for 'incorporated', 'organized', or 'formed'. "
            "2. REINCORPORATION RULE: If the text says 'originally incorporated in 1980' and 'reincorporated in 1995', "
            "return the EARLIEST year (1980). "
            "3. MERGER EXCEPTION: If the current entity was formed by a merger of equals, use the year of that merger. "
            "Answer with ONLY the year (YYYY)."
        ),
    },
    {
        "id": "employee_count",
        "prompt": (
            "What is the total number of employees the registrant has? "
            "1. PREFER FULL-TIME: If the text distinguishes between full-time and part-time, return the full-time count. "
            "2. If only 'total' is given, use that. "
            "3. EXCLUDE: Do not count independent contractors, agents, or temporary staff unless they are the only number given. "
            "4. FORMAT: Remove commas and return ONLY the integer (e.g., return 14500, not 14,500). "
            "If the number is 'approximately 5,000', return 5000."
        ),
    },
    {
        "id": "ceo_lastname",
        "prompt": (
            "What is the LAST NAME of the registrant's current Chief Executive Officer (CEO)? "
            "1. Look for 'Chief Executive Officer', 'CEO', or 'Principal Executive Officer'. "
            "2. If 'Co-CEOs' are listed, pick the first one mentioned. "
            "3. EXCLUDE titles (Mr., Dr.) and first/middle names. "
            "4. If the CEO has a compound last name (e.g., 'Von Trap'), include the full last name. "
            "Answer with ONLY the last name string."
        ),
    },
    {
        "id": "holder_record_amount",
        "prompt": (
            "What is the number of **holders of record** of the registrant's common stock? "
            "1. KEYWORDS: Look for 'holders of record', 'shareholders of record', or 'record holders' in Item 5. "
            "2. WHOLLY-OWNED RULE: If the text states the stock is 'wholly-owned', 'held solely by', or 'all outstanding stock is held by' a parent company, return **1**. "
            "3. EXCLUDE BENEFICIAL OWNERS: Do not use counts of 'beneficial owners' or shares held in 'street name' unless strictly no other number exists. "
            "4. MULTIPLE CLASSES: If Class A and Class B Common Stock are listed, SUM the record holders. Ignore Preferred Stock. "
            "5. DATE PRIORITY: If multiple dates are provided (e.g., 'as of year-end' vs 'as of March 31'), choose the **most recent** date. "
            "Return ONLY the integer (e.g., 4530). Remove commas and words like 'approximately'."
        ),
    },
]

In [None]:
# 6. Full Context Builder (Concatenates ALL Sections)

# Document keys that are concatenated, in this order, into the model context.
SECTION_KEYS = [
    "section_1",
    "section_1A",
    "section_1B",
    "section_2",
    "section_3",
    "section_4",
    "section_5",
    "section_6",
    "section_7",
    "section_7A",
    "section_8",
    "section_9",
    "section_9A",
    "section_9B",
    "section_10",
    "section_11",
    "section_12",
    "section_13",
    "section_14",
    "section_15",
]


def build_full_context(doc):
    """
    Join every non-blank section of a filing into one string.

    Sections are visited in SECTION_KEYS order. Each kept section is preceded
    by a ``--- [SECTION_X] ---`` banner; missing, empty, or whitespace-only
    sections are skipped. Returns "" when no section has content.
    """
    keyed_bodies = ((name, doc.get(name, "")) for name in SECTION_KEYS)
    return "".join(
        f"\n\n--- [{name.upper()}] ---\n\n{body}"
        for name, body in keyed_bodies
        if body and body.strip()
    )

In [None]:
# 7. Extraction Prompt Template

# Prompt sent to the model for every (document, field) pair. It is filled in
# with str.format, so {question} and {context} are placeholders and the final
# "{{" renders as a literal "{". Together with the quote that follows it, the
# assistant turn is pre-filled with '{"'; extract_field prepends the same two
# characters to the model output before parsing it as JSON.
# NOTE(review): the question is placed before the filing text, so the prompts
# for different fields of one document diverge early - presumably this limits
# what prefix caching can reuse; confirm.
# NOTE(review): the template spells out <|begin_of_text|> itself - confirm the
# tokenizer does not add a second BOS token.
EXTRACTION_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a precise SEC 10-K filing data extraction assistant. You MUST:
1. Extract information ONLY from the provided text.
2. If the information is not found, return "NOT_FOUND" as the value.
3. Always provide the exact quote from the text as evidence.
4. Respond ONLY with valid JSON, nothing else.

<|eot_id|><|start_header_id|>user<|end_header_id|>

Read this SEC 10-K filing and answer the question.

**Question**: {question}

**Instructions**:
- Provide your answer as a JSON object with exactly these keys:
  - "value": The extracted answer (or "NOT_FOUND" if not present)
  - "evidence": The EXACT substring from the text that supports your answer (copy word-for-word)
  - "source_sentence": The complete sentence containing the evidence

**10-K Filing Text**:
{context}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{{"""

In [None]:
# 8. LLM Extraction Function

# Endings whose final period belongs to the value itself (legal suffixes and
# similar abbreviations) rather than being sentence punctuation.
_ABBREVIATED_ENDING = re.compile(
    r"\b(?:Inc|Corp|Ltd|Co|Cos|Bros|Jr|Sr|L\.P|L\.L\.C|L\.L\.P|N\.A|S\.A|N\.V|P\.C|D\.C)\.$",
    re.IGNORECASE,
)


def _clean_value(raw_value):
    """Normalize the model's "value" into a stripped string or "NOT_FOUND"."""
    if raw_value is None:
        return "NOT_FOUND"
    cleaned = str(raw_value).strip()
    # Drop sentence-ending periods ("California." -> "California") but keep
    # the period of an abbreviation ("Apple Inc." stays "Apple Inc.").
    if not _ABBREVIATED_ENDING.search(cleaned):
        cleaned = cleaned.rstrip('.')
    return cleaned or "NOT_FOUND"


def _as_text(raw):
    """Return a JSON field as text; None passes through unchanged."""
    if raw is None or isinstance(raw, str):
        return raw
    if isinstance(raw, (list, tuple)):
        # Models sometimes wrap the quote in a one-element array.
        return " ".join(str(item) for item in raw)
    return str(raw)


def extract_field(full_text, question_config, llm, sampling_params):
    """
    Extract a single field from the full filing text using the LLM.

    Args:
        full_text: Filing text substituted for {context} in the prompt.
        question_config: Dict with "id" (used in log messages) and "prompt"
            (substituted for {question}).
        llm: Object whose generate(prompts, sampling_params) returns results
            exposing ``[0].outputs[0].text``.
        sampling_params: Passed through to ``llm.generate``.

    Returns:
        (value, evidence, source_sentence):
        - ("NOT_FOUND", "NO_CONTEXT", "NO_CONTEXT") when full_text is blank.
        - ("JSON_PARSE_ERROR", <first 500 chars of the raw response>,
          "JSON_PARSE_ERROR") when the response is not valid JSON.
        - Otherwise the parsed fields. ``value`` is always a string: JSON
          null or an empty answer becomes "NOT_FOUND", and a trailing
          sentence period is removed unless it ends an abbreviation such as
          "Inc.". ``evidence`` and ``source_sentence`` are strings, or None
          for JSON null; a key missing from the response yields "PARSE_ERROR".
    """
    if not full_text or not full_text.strip():
        return "NOT_FOUND", "NO_CONTEXT", "NO_CONTEXT"

    prompt = EXTRACTION_TEMPLATE.format(
        question=question_config["prompt"],
        context=full_text
    )

    # Generate with vLLM
    outputs = llm.generate([prompt], sampling_params)
    response_text = outputs[0].outputs[0].text.strip()

    # The prompt ends with '{"', so the completion is missing those two
    # characters; restore them, and close the object if the text lacks "}".
    json_str = '{"' + response_text
    if not json_str.endswith('}'):
        json_str += '}'

    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"  [JSON ERROR] {question_config['id']}: {e}")
        print(f"  Raw response: {response_text[:200]}...")
        return "JSON_PARSE_ERROR", response_text[:500], "JSON_PARSE_ERROR"

    value = _clean_value(data.get("value", "PARSE_ERROR"))
    # Coerce to text so evidence verification and CSV export never receive a
    # list, dict or number.
    evidence = _as_text(data.get("evidence", "PARSE_ERROR"))
    source_sentence = _as_text(data.get("source_sentence", "PARSE_ERROR"))
    return value, evidence, source_sentence

In [None]:
# 9. Evidence Verification ("Judge" Logic)

def get_fingerprint(text):
    """Return ``text`` reduced to its word characters (no underscores), lower-cased."""
    word_chars_only = re.sub(pattern=r'[\W_]+', repl='', string=text)
    return word_chars_only.lower()

def verify_evidence(full_text, evidence_quote):
    """
    Check whether the evidence quote actually occurs in the full text.

    Matching is attempted in three increasingly lenient steps: exact
    substring, then case-insensitive with whitespace runs collapsed, then a
    comparison of the get_fingerprint() forms (word characters only), which
    is only trusted for fingerprints longer than 10 characters.

    Args:
        full_text: Text the quote is supposed to come from.
        evidence_quote: Quote returned by the model. Non-string values are
            converted with str() before matching.

    Returns:
        None when there is nothing to verify (empty, whitespace-only, or a
        sentinel such as "NOT_FOUND"); True when the quote was located;
        False when it was not (potential hallucination) or full_text is empty.
    """
    if not evidence_quote:
        return None  # Not applicable

    # The model may return a number or another JSON type instead of a string;
    # the substring tests below would raise TypeError on those.
    if not isinstance(evidence_quote, str):
        evidence_quote = str(evidence_quote)

    # A whitespace-only quote collapses to "" below, and "" is a substring of
    # every text - treat it as "nothing to verify", not as a match.
    if not evidence_quote.strip():
        return None

    if evidence_quote in ("NOT_FOUND", "NO_CONTEXT", "NO_CONTENT", "PARSE_ERROR", "JSON_PARSE_ERROR"):
        return None  # Not applicable

    if not full_text:
        return False

    # 1. Exact match
    if evidence_quote in full_text:
        return True

    # 2. Normalized match (ignore case and whitespace differences)
    clean_text = " ".join(full_text.split()).lower()
    clean_evd = " ".join(evidence_quote.split()).lower()

    if clean_evd in clean_text:
        return True

    # 3. Fingerprint match (ignore all punctuation and spacing)
    fp_text = get_fingerprint(full_text)
    fp_evd = get_fingerprint(evidence_quote)

    # Very short fingerprints would match almost anywhere, so they are not
    # accepted as proof.
    if len(fp_evd) > 10 and fp_evd in fp_text:
        return True

    return False

In [None]:
# 10. Load Dataset (from existing notebook pattern)

def load_edgar_dataset():
    """Open the "train" split of the EDGAR corpus as a streaming dataset."""
    stream_options = {
        "split": "train",
        "streaming": True,
        "revision": "refs/convert/parquet",
    }
    corpus = load_dataset("c3po-ai/edgar-corpus", "default", **stream_options)
    return corpus

In [None]:
# 11. Process Single Document

def process_document(doc, llm, sampling_params):
    """
    Build one output row for a filing.

    The row holds the document's filename, cik and year, the concatenated
    filing text ("full_text", kept for manual review), and four columns for
    every QUESTION_BANK entry: {id}_value, {id}_evidence,
    {id}_source_sentence and {id}_evidence_verified.

    When the filing has no text, the three text columns of every field are
    set to "NO_CONTENT" and the verification flag to None without calling
    the model.
    """
    result = {meta_key: doc.get(meta_key) for meta_key in ("filename", "cik", "year")}

    # The context is assembled once and reused for every question.
    full_text = build_full_context(doc)
    result["full_text"] = full_text

    for question in QUESTION_BANK:
        prefix = question["id"]

        if full_text:
            value, evidence, source_sentence = extract_field(
                full_text, question, llm, sampling_params
            )
            # Confirm the quoted evidence really appears in the filing.
            verified = verify_evidence(full_text, evidence)
        else:
            value = evidence = source_sentence = "NO_CONTENT"
            verified = None

        result.update({
            f"{prefix}_value": value,
            f"{prefix}_evidence": evidence,
            f"{prefix}_source_sentence": source_sentence,
            f"{prefix}_evidence_verified": verified,
        })

    return result

In [None]:
# 12. Main Extraction Loop

def _save_checkpoint(df_results, pending_rows, output_file):
    """Append pending_rows to df_results, rewrite the CSV, return the combined frame."""
    df_batch = pd.DataFrame(pending_rows)
    if df_results.empty:
        # Nothing to merge with; also avoids concatenating an empty frame.
        combined = df_batch
    else:
        combined = pd.concat([df_results, df_batch], ignore_index=True)
    combined.to_csv(output_file, index=False)
    return combined


def run_extraction(
    output_file=OUTPUT_FILE,
    limit=MAX_DOCUMENTS,
    batch_size=BATCH_SIZE,
):
    """
    Main extraction loop with resume support and batch checkpointing.

    Args:
        output_file: CSV path. If it exists, its rows are loaded and their
            filenames are skipped; it is rewritten at every checkpoint.
        limit: Target row count, counting rows resumed from output_file.
        batch_size: Number of newly processed documents between checkpoints.

    Returns:
        DataFrame holding the resumed rows plus the rows produced by this call.

    Rows processed since the last checkpoint are written even when the loop
    is interrupted by an exception or KeyboardInterrupt; the exception then
    propagates to the caller.
    """
    print(f"--- COMBINED EXTRACTION: {limit} documents ---")

    # 1. Resume support: Load existing progress
    df_results = pd.DataFrame()
    processed_files = set()
    if os.path.exists(output_file):
        try:
            df_results = pd.read_csv(output_file)
        except pd.errors.EmptyDataError:
            # A zero-byte checkpoint (e.g. an interrupted first write) means
            # "nothing processed yet", not a fatal error.
            df_results = pd.DataFrame()
        if not df_results.empty:
            processed_files = set(df_results["filename"].tolist())
        print(f"Resuming: {len(processed_files)} documents already processed.")

    # 2. Load dataset
    dataset = load_edgar_dataset()

    current_batch = []
    total_processed = len(processed_files)
    new_processed = 0

    # 3. Main loop
    try:
        # The bar is advanced manually so that it counts extracted documents
        # only, not documents skipped while resuming.
        with tqdm(
            total=limit,
            initial=min(total_processed, limit),
            desc="Extracting",
        ) as progress:
            for doc in dataset:
                # Limit check comes first so a finished run stops right away
                # instead of streaming past every already-processed document.
                if total_processed + new_processed >= limit:
                    break

                fname = doc.get("filename")

                # Skip if already processed (earlier run or duplicate entry)
                if fname in processed_files:
                    continue

                # Process document
                result = process_document(doc, llm, SAMPLING_PARAMS)
                current_batch.append(result)
                processed_files.add(fname)
                new_processed += 1
                progress.update(1)

                # Batch checkpoint
                if len(current_batch) >= batch_size:
                    df_results = _save_checkpoint(df_results, current_batch, output_file)
                    current_batch = []

                    # Memory cleanup
                    gc.collect()
                    torch.cuda.empty_cache()
                    print(f"  [Checkpoint] Saved {total_processed + new_processed}/{limit} docs.")
    finally:
        # 4. Final save - also runs on errors so a partial batch is not lost.
        if current_batch:
            df_results = _save_checkpoint(df_results, current_batch, output_file)

    print(f"--- EXTRACTION COMPLETE: {output_file} ---")
    return df_results

In [None]:
# 13. Run Extraction

# Start (or resume) the extraction using the values from the configuration
# cell; the returned frame is reused by the report and inspection cells below.
df_extracted = run_extraction(
    output_file=OUTPUT_FILE,
    limit=MAX_DOCUMENTS,
    batch_size=BATCH_SIZE,
)

# Display sample results
print(f"\nTotal rows: {len(df_extracted)}")
# NOTE(review): display is not imported in this file - presumably the
# IPython/Jupyter built-in; confirm before running outside a notebook.
display(df_extracted.head())

In [None]:
# 14. Quality Report

def generate_quality_report(df, question_bank=None):
    """
    Print a per-field summary of extraction quality.

    Args:
        df: Results frame with {id}_value and, optionally,
            {id}_evidence_verified columns.
        question_bank: Iterable of dicts with an "id" key naming the fields
            to report on. Defaults to the module-level QUESTION_BANK.

    For each field whose value column exists, prints how many rows hold a
    real answer (not missing, not blank, not an error/not-found sentinel)
    and how many rows have evidence flagged as verified. Fields without a
    value column are skipped; an empty frame reports 0.0%.
    """
    if question_bank is None:
        question_bank = QUESTION_BANK

    not_found_markers = ("", "NOT_FOUND", "NO_CONTENT", "JSON_PARSE_ERROR", "PARSE_ERROR")
    total = len(df)

    print("\n=== EXTRACTION QUALITY REPORT ===")
    print(f"Total documents: {total}")
    print()

    for q in question_bank:
        field_id = q["id"]
        value_col = f"{field_id}_value"
        verified_col = f"{field_id}_evidence_verified"

        if value_col not in df.columns:
            continue

        values = df[value_col]
        # Missing cells (None/NaN) are not answers either.
        is_missing = values.isna() | values.astype(str).str.strip().isin(not_found_markers)
        found = int((~is_missing).sum())
        found_pct = 100 * found / total if total else 0.0

        if verified_col in df.columns:
            # The column mixes True/False with None (and may hold text after
            # a CSV round trip), so it is rarely of bool dtype; compare the
            # textual form instead of relying on the dtype.
            flags = df[verified_col].astype(str).str.strip().str.lower()
            verified = int(flags.eq("true").sum())
        else:
            verified = "N/A"

        print(f"{field_id}:")
        print(f"  Found: {found}/{total} ({found_pct:.1f}%)")
        print(f"  Evidence Verified: {verified}")
        print()

# Only report on a non-empty frame: the report prints percentages computed
# over the number of rows.
if not df_extracted.empty:
    generate_quality_report(df_extracted)

In [None]:
# 15. Inspect Specific Results (by row number)

ROW_NUMBER = 0  # Change this to inspect different rows

# Print every field's value, evidence and verification flag for one row.
if not df_extracted.empty and ROW_NUMBER < len(df_extracted):
    row = df_extracted.iloc[ROW_NUMBER]

    print(f"=== ROW {ROW_NUMBER}: {row['filename']} ===")
    print(f"CIK: {row['cik']} | Year: {row['year']}")
    print()

    for q in QUESTION_BANK:
        field_id = q["id"]
        print(f"--- {field_id} ---")
        print(f"  Value: {row[f'{field_id}_value']}")
        print(f"  Evidence: {row[f'{field_id}_evidence']}")
        print(f"  Verified: {row[f'{field_id}_evidence_verified']}")
        print()
else:
    print(f"Row {ROW_NUMBER} not found. DataFrame has {len(df_extracted)} rows.")



In [None]:

# 16. Inspect the Full Text of a Row (Optional)

ROW_TO_INSPECT = 0  # Change this to view different rows
# Dump the stored filing text of the chosen row for manual review.
if not df_extracted.empty and ROW_TO_INSPECT < len(df_extracted):
    row = df_extracted.iloc[ROW_TO_INSPECT]

    print(f"=== FULL TEXT: {row['filename']} ===")
    print(f"Length: {len(row['full_text'])} characters")
    print("=" * 60)
    print(row['full_text'])
else:
    print(f"Row {ROW_TO_INSPECT} not found.")