# EDGAR Ground Truth Extraction Tournament

This notebook compares three strategies for extracting data from SEC 10-K filings:

1. **Pure Regex**: Fast, baseline method using regular expressions.
2. **Hybrid (Smart Locator + LLM)**: Tries **strict regex anchors** first to locate the relevant passage; if none match, it falls back to generic **keywords**. It then passes a ~1,500-character window around the hit to Qwen 2.5, which extracts the final answer from that zoomed-in context.
3. **Pure LLM**: Feeds the entire section (truncated to fit context) to the LLM.

### Goal
Determine which method provides the best trade-off between accuracy and cost/speed.

In [None]:
# 1. Installation & Setup
# Uncomment the line below if you need to install these libraries
#!pip install -q torch transformers datasets pandas tqdm accelerate bitsandbytes

In [None]:
# 2. Import installed libraries
import os
import re
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Disable pandas' column-width truncation so long extracted answers are
# shown in full when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

# Simple marker that the import/configuration cell ran to completion.
print("Setup Complete.")

In [None]:
# 3. Load Model (Qwen 2.5-7B-Instruct)
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, logging

# Only let errors through from the Hugging Face logger.
logging.set_verbosity_error()

# Checkpoint to load, weight precision, and the device to run on.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
DTYPE = torch.bfloat16
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading model...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)

# Reuse the end-of-sequence token for padding when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# With a GPU available the model is mapped onto "cuda"; otherwise no
# device_map is passed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=DTYPE,
    device_map=DEVICE if DEVICE == "cuda" else None,
    trust_remote_code=True,
)
model.eval()  # switch the module to evaluation mode

print("Loaded:", MODEL_NAME, "| device:", DEVICE, "| PID:", os.getpid())

In [None]:
# 4. Unified Configuration (Question Bank)
# UPGRADED: Now uses 'regex_anchors' (List) from compare_extractions.py for smarter location.

# Maps a filing section key (e.g. "section_1") to the list of questions asked
# about that section. Each question is a dict with:
#   "id"                - result key / column-name stem
#   "prompt"            - question text sent to the LLM
#   "regex_anchors"     - ordered regex patterns; capture group 1 is the answer
#                         for the pure-regex strategy, and the match position is
#                         the zoom point for the hybrid strategy
#   "fallback_keywords" - ordered plain keywords used to locate context when no
#                         anchor matches
QUESTION_BANK = {
    "section_1": [
        {
            "id": "incorporation_state",
            "prompt": "In which U.S. state was this company incorporated? Answer with ONLY the state name. If not found, answer NULL.",
            # Priority 1: Strict Regex Patterns (from compare_extractions.py)
            "regex_anchors": [
                r"(?i)incorporated (?:in|under the laws of) (?:the state of )?(\w+(?:\s+\w+)?)",
                r"(?i)organized (?:in|under the laws of) (?:the state of )?(\w+(?:\s+\w+)?)",
                r"(?i)a (\w+(?:\s+\w+)?) corporation",  # Matches "a Delaware corporation"
                r"(?i)state of incorporation[:\s]+(\w+(?:\s+\w+)?)",
            ],
            # Priority 2: Fallback Generic Keywords
            "fallback_keywords": [
                "incorporated",
                "organized under",
                "laws of the state",
                "formed under",
            ],
        },
        {
            "id": "incorporation_year",
            "prompt": "In what year was this company incorporated? Answer with ONLY the year. If not found, answer NULL.",
            "regex_anchors": [
                r"(?i)(?:incorporated|organized|founded|established|formed) (?:in |on |)(?:\w+ )?(19\d{2}|20\d{2})",
                r"(?i)since (19\d{2}|20\d{2})",
            ],
            "fallback_keywords": [
                "incorporated",
                "founded",
                "organized",
                "formed",
                "year",
            ],
        },
        {
            "id": "employee_count",
            "prompt": "How many full-time employees does the company have? Answer with ONLY the number. If not found, answer NULL.",
            "regex_anchors": [
                # The capture must start with a digit: "[0-9,]+" would also
                # accept a lone comma (e.g. "employees, including ...").
                r"(?i)(?:approximately|approx\.|had|total of|employ)\s+(\d[\d,]*)(?:\s+full-time)?\s+employees",
                r"(?i)employees.*?(\d[\d,]*)",
            ],
            "fallback_keywords": [
                "employees",
                "full-time",
                "employed",
                "workforce",
                "persons",
            ],
        },
        {
            "id": "fiscal_year_end",
            "prompt": "On what date does the company's fiscal year end? Answer with Month and Day (e.g., 'December 31'). If not found, answer NULL.",
            "regex_anchors": [
                # Month names are spelled out because, under (?i), a generic
                # "[A-Z][a-z]+" matches any word ("ended in 2019" -> "in 20").
                r"(?i)fiscal year end(?:ed|s)(?:\s+on)?\s+"
                r"((?:January|February|March|April|May|June|July|August"
                r"|September|October|November|December) \d{1,2})"
            ],
            "fallback_keywords": [
                "fiscal year end",
                "fiscal year ends",
                "fiscal year ended",
            ],
        },
        {
            "id": "company_product",
            "prompt": "What is the main product, service, or business activity of this company? Answer in 2-5 words. If not found, answer NULL.",
            "regex_anchors": [
                r"(?i)engaged in the (?:business|manufacture|sale|development) of ([^.;]+)"
            ],
            "fallback_keywords": [
                "engaged in",
                "business of",
                "manufacture",
                "sale of",
                "products",
            ],
        },
    ],
    "section_2": [
        {
            "id": "headquarters_state",
            "prompt": "In which U.S. state are the company's principal executive offices located? Answer with ONLY the state name. If not found, answer NULL.",
            "regex_anchors": [
                r"(?i)(?:headquarters|principal (?:executive )?offices?|corporate offices?) (?:is |are |)(?:located |)in ([^,\.\n]+)",
                # Left fully case-insensitive on purpose so two-letter state
                # abbreviations ("DE 19801") are still captured.
                r"(?i)executive offices.*?,[\s\r\n]+([A-Z][a-z]+(?: [A-Z][a-z]+)*)[\s\r\n]+\d{5}",
            ],
            "fallback_keywords": [
                "executive offices",
                "headquarters",
                "principal offices",
            ],
        }
    ],
    "section_10": [
        {
            "id": "ceo_lastname",
            "prompt": "What is the Last Name of the current CEO? Answer with ONLY the last name. If not found, answer NULL.",
            "regex_anchors": [
                # Case-insensitivity is scoped to the title only, via (?i:...).
                # The name itself must be two Capitalized words; with a global
                # (?i) the name group matched arbitrary words such as
                # "serves as" or "and President".
                r"([A-Z][a-z]+ [A-Z][a-z]+)[,\s]+(?i:(?:is |serves as |)(?:the |our |)(?:Chief Executive Officer|CEO))",
                r"(?i:Chief Executive Officer|CEO)[:\s]+([A-Z][a-z]+ [A-Z][a-z]+)",
            ],
            "fallback_keywords": ["chief executive officer", "ceo", "serves as"],
        }
    ],
}

In [None]:
# 5. New "Smart Context" Logic
# This implements the logic: strict regex match -> zoom out -> fallback keyword -> zoom out

def get_window(text, center_idx, size=1500):
    """Return the slice of ``text`` around ``center_idx``.

    The slice reaches ``size // 2`` characters to each side of
    ``center_idx`` and is clipped to the bounds of ``text``, so it is
    shorter than ``size`` when the index lies near either end.
    """
    half_span = size // 2

    left = center_idx - half_span
    if left < 0:
        left = 0

    right = center_idx + half_span
    text_length = len(text)
    if right > text_length:
        right = text_length

    return text[left:right]

def get_smart_context(full_text, config_question, window_size=1500):
    """Pick the excerpt of ``full_text`` most likely to answer a question.

    Lookup order:
      1. The first pattern in ``config_question["regex_anchors"]`` that
         matches; the excerpt is a window around the start of the match.
      2. The first entry of ``config_question["fallback_keywords"]`` found
         in the text (case-insensitive); the excerpt is a window around it.
      3. Otherwise the first ``window_size`` characters of the text.

    Returns an empty string when ``full_text`` is empty or None.
    """
    if not full_text:
        return ""

    hit_at = None

    # Stage 1: strict anchors, in configured order.
    for anchor in config_question.get("regex_anchors", []):
        found = re.search(anchor, full_text)
        if found is not None:
            # Only the location is used here; the LLM reads the surrounding
            # window rather than trusting the regex capture.
            print(f"  [SmartLocator] Found Anchor: {anchor[:30]}...") # Debug log
            hit_at = found.start()
            break

    # Stage 2: generic keywords, searched against a lowercased copy.
    if hit_at is None:
        lowered = full_text.lower()
        for term in config_question.get("fallback_keywords", []):
            position = lowered.find(term.lower())
            if position >= 0:
                hit_at = position
                break

    # Stage 3: nothing located, so hand back the head of the text.
    if hit_at is None:
        return full_text[:window_size]

    return get_window(full_text, hit_at, window_size)


def ask_llm(context, prompt, model, tokenizer, max_new_tokens=50):
    """Ask the chat model one extraction question about ``context``.

    Args:
        context: Filing excerpt to read. Empty or whitespace-only input
            short-circuits to ``None`` without calling the model.
        prompt: The question text appended after the context.
        model: Causal LM exposing ``device`` and ``generate``.
        tokenizer: Matching tokenizer exposing ``apply_chat_template``,
            ``__call__`` and ``decode``.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The first line of the model's reply with surrounding whitespace
        removed, or ``None`` when the reply is empty or contains "null" /
        "not found" (case-insensitive).
    """
    if not context or not context.strip():
        return None

    messages = [
        {"role": "system", "content": "You are a precise data extraction assistant."},
        {"role": "user", "content": f"Read this SEC 10-K filing excerpt and answer the question. \nContext: \"{context}\"\n\nQuestion: {prompt} \nIf the information is not present in the context, reply with 'NULL'."}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Greedy decoding (do_sample=False) so that repeated runs give the same answer.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # The generated sequence starts with the prompt tokens; decode only the tail.
    prompt_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Keep the first line only, and strip it again: the first line can carry
    # trailing whitespace of its own (e.g. "Delaware \n...").
    answer = response.strip().split('\n')[0].strip()

    # An empty reply is "no answer"; returning "" would slip past the
    # None-based null handling downstream.
    if not answer:
        return None

    lowered = answer.lower()
    if "null" in lowered or "not found" in lowered:
        return None

    return answer

In [None]:
# 6. Strategy Implementations


def extract_pure_regex(doc):
    """Strategy 1: answer every question using only the regex anchors.

    For each section in ``QUESTION_BANK`` the section text is read from
    ``doc``. Each question's ``regex_anchors`` are tried in order with
    ``re.DOTALL``; the first anchor whose capture group 1 is non-empty after
    trimming surrounding spaces and punctuation supplies the answer
    (truncated to 100 characters).

    Args:
        doc: Mapping from section key to section text.

    Returns:
        Dict mapping question id to the extracted string, or ``None`` when
        the section is missing/empty or no anchor produced a usable value.
    """
    results = {}
    for section, questions in QUESTION_BANK.items():
        text = doc.get(section, "")
        if not text:
            for q in questions:
                results[q["id"]] = None
            continue

        # Convert once per section instead of once per pattern.
        text = str(text)

        for q in questions:
            found_val = None
            for pattern in q.get("regex_anchors", []):
                match = re.search(pattern, text, re.DOTALL)
                if not match:
                    continue

                candidate = (match.group(1) or "").strip(" .,;")[:100]
                # A capture made only of punctuation/whitespace (e.g. ",") is
                # not an answer: try the next anchor instead of storing "".
                if candidate:
                    found_val = candidate
                    break

            results[q["id"]] = found_val

    return results


def extract_hybrid_llm(doc, model, tokenizer):
    """Strategy 2: locate a focused excerpt, then let the LLM extract.

    For every question in ``QUESTION_BANK``, ``get_smart_context`` narrows
    the section text to a window (regex anchor, then keyword, then start of
    text) and ``ask_llm`` answers the question from that window.

    Returns a dict mapping question id to the LLM answer (or ``None``).
    """
    answers = {}
    for section_key, section_questions in QUESTION_BANK.items():
        section_text = doc.get(section_key, "")
        answers.update(
            {
                question["id"]: ask_llm(
                    get_smart_context(section_text, question),
                    question["prompt"],
                    model,
                    tokenizer,
                )
                for question in section_questions
            }
        )
    return answers


def extract_pure_llm(doc, model, tokenizer, max_context_chars=12000):
    """Strategy 3: give the LLM the (truncated) section text directly.

    Each section's text is cut to its first ``max_context_chars`` characters
    and every question for that section is asked against that same excerpt.
    Questions whose section is missing or empty get ``None`` without a model
    call.

    Returns a dict mapping question id to the LLM answer (or ``None``).
    """
    answers = {}
    for section_key, section_questions in QUESTION_BANK.items():
        section_text = doc.get(section_key, "")

        if section_text:
            # Keep only the head of the section so the prompt fits the context.
            excerpt = section_text[:max_context_chars]
            for question in section_questions:
                answers[question["id"]] = ask_llm(
                    excerpt, question["prompt"], model, tokenizer
                )
        else:
            for question in section_questions:
                answers[question["id"]] = None

    return answers

In [None]:
# 7. The Tournament Loop
# Runs detailed comparison on a small batch of documents

def create_df_tournament(model, tokenizer, TARGET_DOCS=5):
    """Run all three extraction strategies on the first ``TARGET_DOCS`` filings.

    Streams the "c3po-ai/edgar-corpus" train split, runs the regex, hybrid
    and pure-LLM extractors on each document, and builds one row per
    document with ``<key>_Regex``, ``<key>_Hybrid``, ``<key>_LLM`` and
    ``<key>_Match`` columns (``<key>`` is the question id with
    "incorporation_" shortened to "Inc_" and "headquarters_" to "HQ_").

    ``<key>_Match`` compares the hybrid and pure-LLM answers only:
    "1" if equal ignoring case and surrounding whitespace, "0" if they
    differ or exactly one is missing, and missing if both are missing.

    The frame is displayed, then missing values are replaced by the string
    "NULL" before it is returned.
    """

    def consensus_flag(hybrid_answer, llm_answer):
        """Return the hybrid-vs-LLM agreement marker for one question."""
        if hybrid_answer is None and llm_answer is None:
            return None
        if hybrid_answer is None or llm_answer is None:
            return "0"
        agrees = hybrid_answer.strip().lower() == llm_answer.strip().lower()
        return "1" if agrees else "0"

    print("Loading Data Stream...")
    dataset = load_dataset(
        "c3po-ai/edgar-corpus",
        "default",
        split="train",
        streaming=True,
        revision="refs/convert/parquet",
    )

    rows = []

    print(f"Running Tournament on {TARGET_DOCS} docs...")
    for doc_index, doc in enumerate(dataset):
        if doc_index >= TARGET_DOCS:
            break

        doc_label = doc.get("filename", f"Doc {doc_index}")
        print(f"Processing {doc_label}...")

        # One result dict per strategy, all keyed by question id.
        regex_answers = extract_pure_regex(doc)
        hybrid_answers = extract_hybrid_llm(doc, model, tokenizer)
        llm_answers = extract_pure_llm(doc, model, tokenizer)

        # Document metadata first, then four columns per question.
        row = {
            "filename": doc.get("filename"),
            "cik": doc.get("cik"),
            "year": doc.get("year"),
        }

        for question_id in regex_answers:
            # Abbreviate the id so the column names stay narrow.
            column_stem = question_id.replace("incorporation_", "Inc_")
            column_stem = column_stem.replace("headquarters_", "HQ_")

            hybrid_value = hybrid_answers.get(question_id)
            llm_value = llm_answers.get(question_id)

            row[f"{column_stem}_Regex"] = regex_answers.get(question_id)
            row[f"{column_stem}_Hybrid"] = hybrid_value
            row[f"{column_stem}_LLM"] = llm_value
            row[f"{column_stem}_Match"] = consensus_flag(hybrid_value, llm_value)

        rows.append(row)

    df_tournament = pd.DataFrame(rows)
    print("\n--- TOURNAMENT RESULTS ---")
    display(df_tournament)
    # Replace missing values only after display, so the shown table keeps them.
    df_tournament.fillna("NULL", inplace=True)
    return df_tournament

In [None]:
# 8. Run the tournament on a small sample of documents.

# No document count is passed, so the default of 5 documents is used.
df_sample = create_df_tournament(model, tokenizer)   
# Feel free to add code cells below to analyze the results.

In [None]:
# 9. If the sample results look fine, save them to a CSV file.
df_sample.to_csv("edgar_tournament_sample.csv", index=False) 

In [None]:
# 10. If the sample looks fine, run the tournament on 250 documents and save
# the results to a CSV file. This runs every strategy again on each document.
df_full = create_df_tournament(model, tokenizer, 250)
df_full.to_csv("edgar_tournament_full.csv", index=False)

In [None]:
import math

# Null-rate summary: for every answer column of df_full, count how often the
# value is the "NULL" placeholder and express it as a whole-number percentage.

# 1. Identify the columns to KEEP.
# Metadata columns are excluded by exact name and the consensus columns by
# their "_Match" suffix. Substring matching must not be used here: the term
# "year" would also drop the Inc_year_* and fiscal_year_end_* answer columns.
metadata_cols = {"filename", "year", "cik"}
cols_to_keep = [
    col
    for col in df_full.columns
    if col not in metadata_cols and not col.endswith("_Match")
]

# 2. Create a subset (this does not change df_full)
subset = df_full[cols_to_keep]

# 3. Calculate stats on the subset
# Compare the entire subset to the string 'NULL' and count hits per column
null_counts = (subset == "NULL").sum()

# Percentage of all rows, rounded down to a whole number
null_percentages = (null_counts / len(df_full) * 100).apply(math.floor)

# 4. Create the final result table
summary_df = pd.DataFrame({"Null Counts": null_counts, "Null %": null_percentages})

# Filter out columns with 0 nulls if you only want to see problems
# summary_df = summary_df[summary_df['Null Counts'] > 0]

print(summary_df)