# EDGAR Ground Truth Extraction Tournament

This notebook compares three strategies for extracting data from SEC 10-K filings:

1. **Pure Regex**: Fast, baseline method using regular expressions.
2. **Hybrid (Smart Locator + LLM)**: Uses **Strict Regex Anchors** first to locate the exact paragraph. If that fails, it falls back to **Keywords**. Then it asks Qwen 2.5 to extract the final answer from that zoomed-in context.
3. **Pure LLM**: Feeds the entire section (truncated to fit context) to the LLM.

### Goal
Determine which method provides the best trade-off between accuracy and cost/speed.

In [None]:
# 1. Installation & Setup
# Uncomment the line below if you need to install these libraries
#!pip install -q torch transformers datasets pandas tqdm accelerate bitsandbytes

In [None]:
# 2. Import installed libraries
import os
import re
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Configure display to show full text in pandas
pd.set_option('display.max_colwidth', None)

print("Setup Complete.")

In [None]:
# 3. Load Model (Qwen 2.5-7B-Instruct)
# Loads the tokenizer and causal-LM weights once; later cells reuse the
# module-level `tokenizer` and `model` objects created here.
# `os` and `torch` are already imported by the import cell above; the repeat is harmless.
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, logging

# Quiet HF logs
# (`logging` here is the transformers logging helper imported above, not the stdlib module.)
logging.set_verbosity_error()

MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
DTYPE = torch.bfloat16
# Use the GPU when one is visible to torch, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading model...")

# NOTE(review): trust_remote_code=True allows code shipped in the model repo to run
# locally — confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
    trust_remote_code=True,
)

# Fall back to the EOS token when the tokenizer defines no padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# device_map is only passed as "cuda" when a GPU was detected; on CPU it is None.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=DTYPE,
    device_map="cuda" if DEVICE == "cuda" else None,
    trust_remote_code=True,
)
# Inference only: switch the module to evaluation mode.
model.eval()

print("Loaded:", MODEL_NAME, "| device:", DEVICE, "| PID:", os.getpid())

In [None]:
# 4. Unified Configuration (Question Bank)
# UPGRADED: Now uses 'regex_anchors' (List) from compare_extractions.py for smarter location.
#
# Structure: {section_key: [question, ...]} where every question is a dict with
#   "id"                - result key / column-name stem
#   "prompt"            - instruction sent to the LLM
#   "regex_anchors"     - ordered regex list; capture group 1 is the extracted value
#   "fallback_keywords" - lowercase keywords used when no anchor matches

QUESTION_BANK = {
    "section_1": [
        {
            "id": "company_name",
            # PROMPT: Targets the specific filing entity
            "prompt": (
                "What is the exact legal name of the registrant? "
                "1. Look for the very first sentence of the 'Business' section or the cover page intro "
                "(e.g., 'Apple Inc. (the Registrant)...'). "
                "2. Do NOT use 'Doing Business As' (DBA) names or brand names. "
                "3. Do NOT include the stock ticker symbol. "
                "4. Include legal suffixes like 'Inc.', 'Corp.', 'Ltd.' if present. "
                "Answer with ONLY the legal name string."
            ),
            # REGEX: Finds the intro paragraph
            "regex_anchors": [
                r"([A-Z0-9][\w\s.,&'-]+?)\s*\(?(?:the\s+)?(?:Company|Registrant)\b",
                r"([A-Z0-9][\w\s.,&'-]+?),\s+a\s+\w+\s+corporation",
            ],
            "fallback_keywords": [
                "incorporated",
                "organized",
                "the company",
                "registrant",
            ],
        },
        {
            "id": "headquarters_city",
            "prompt": (
                "In which city are the registrant's *principal executive offices* physically located? "
                "1. Look for the address under 'Executive Offices' or 'Address of Principal Executive Offices'. "
                "2. CRITICAL WARNING: Do NOT return the city of the 'Registered Agent' or 'State of Incorporation' "
                "(e.g., ignore 'Wilmington' or 'Dover' unless the CEO actually works there). "
                "3. Ignore P.O. Boxes. "
                "Answer with ONLY the city name."
            ),
            # NOTE(review): the leading (?i) also makes [A-Z] / [a-z] case-insensitive,
            # so these classes do not enforce capitalisation.
            "regex_anchors": [
                r"(?i)executive\s+offices.*?(?:located|address).*?[\r\n]+.*?,?\s*([A-Z][a-z]+(?: [A-Z][a-z]+)*),?\s+[A-Z]{2}\s+\d{5}",
                r"(?i)located\s+at\s+.*?,?\s*([A-Z][a-z]+(?: [A-Z][a-z]+)*),?\s+[A-Z]{2}\s+\d{5}",
            ],
            "fallback_keywords": ["executive offices", "located at", "address"],
        },
        # --- STATE: Hybrid Logic (Successor=New, Reinc=Old) ---
        {
            "id": "original_incorporation_state",
            # PROMPT: Explicitly handles the Monsanto vs Hexcel conflict
            "prompt": (
                "In which U.S. state was the registrant *originally* incorporated or organized? "
                "Follow this strict hierarchy: "
                "1. PRIORITIZE HISTORY: Look for phrases like 'originally incorporated in', 'formerly organized in', "
                "or 'predecessor company incorporated in'. "
                "2. REINCORPORATION RULE: If the company reincorporated (e.g., moved from California to Delaware), "
                "you MUST return the OLD state (California), not the current one. "
                "3. MERGER EXCEPTION: Only if the registrant is a *new* successor entity formed by a merger, "
                "return the state of that successor. "
                "4. If no history is mentioned, return the current state. "
                "Answer with ONLY the state name."
            ),
            # REGEX: Casts a wide net to find any mention of inc, org, or predecessors
            "regex_anchors": [
                r"(?i)incorporated (?:in|under the laws of) (?:the state of )?(\w+(?:\s+\w+)?)",
                r"(?i)organized (?:in|under the laws of) (?:the state of )?(\w+(?:\s+\w+)?)",
                r"(?i)a (\w+(?:\s+\w+)?) corporation",
                r"(?i)state of incorporation[:\s]+(\w+(?:\s+\w+)?)",
                r"(?i)originally\s+(?:incorporated|organized).*?in\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)",
                r"(?i)predecessor.*?incorporated.*?in\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)",
            ],
            "fallback_keywords": [
                "incorporated",
                "organized",
                "originally",
                "predecessor",
                "laws of the state",
            ],
        },
        # --- YEAR: Hybrid Logic (Successor=New, Reinc=Old) ---
        {
            "id": "original_incorporation_year",
            # PROMPT: Explicitly handles the Monsanto vs Hexcel conflict
            "prompt": (
                "In which year was the registrant *originally* incorporated or organized? "
                "1. IGNORE 'FOUNDED' dates. Only look for 'incorporated', 'organized', or 'formed'. "
                "2. REINCORPORATION RULE: If the text says 'originally incorporated in 1980' and 'reincorporated in 1995', "
                "return the EARLIEST year (1980). "
                "3. MERGER EXCEPTION: If the current entity was formed by a merger of equals, use the year of that merger. "
                "Answer with ONLY the year (YYYY)."
            ),
            "regex_anchors": [
                r"(?i)(?:incorporated|organized|founded|established|formed) (?:in |on |)(?:\w+ )?(18\d{2}|19\d{2}|20\d{2})",
                r"(?i)originally\s+incorporated.*?(18\d{2}|19\d{2}|20\d{2})",
            ],
            "fallback_keywords": [
                "founded",
                "incorporated",
                "organized",
                "year",
                "originally",
            ],
        },
        # --- EMPLOYEES: Exclusion Logic ---
        {
            "id": "employee_count",
            "prompt": (
                "What is the total number of employees the registrant has? "
                "1. PREFER FULL-TIME: If the text distinguishes between full-time and part-time, return the full-time count. "
                "2. If only 'total' is given, use that. "
                "3. EXCLUDE: Do not count independent contractors, agents, or temporary staff unless they are the only number given. "
                "4. FORMAT: Remove commas and return ONLY the integer (e.g., return 14500, not 14,500). "
                "If the number is 'approximately 5,000', return 5000."
            ),
            # The optional qualifier carries its own trailing whitespace so that a plain
            # "5,000 employees" (single space) matches, and the capture group must start
            # with a digit so a bare comma is never captured.
            "regex_anchors": [
                r"(?i)(?:had|employ).*?([0-9][0-9,]*)\s+(?:full-time\s+)?employees",
                r"(?i)(?:had|employ(?:ed|s)?|totaling)\s+(?:(?:approximately|over|roughly|about)\s+)?([0-9][0-9,]*)\s+(?:(?:full-time|total)\s+)?employees",
                r"(?i)([0-9][0-9,]*)\s+(?:full-time\s+)?(?:people|persons|employees)\s+(?:were|are)\s+employed",
            ],
            "fallback_keywords": ["employees", "full-time"],
        },
        {
            "id": "headquarters_state",
            "prompt": (
                "In which U.S. state are the registrant's *principal executive offices* physically located? "
                "1. This is the state where the HQ building is, NOT necessarily the state of incorporation. "
                "2. CRITICAL: If the text says 'Incorporated in Delaware' but 'Executive offices in California', "
                "return CALIFORNIA. "
                "Answer with ONLY the state name."
            ),
            "regex_anchors": [
                r"(?i)(?:headquarters|principal (?:executive )?offices?|corporate offices?) (?:is |are |)(?:located |)in ([^,\.\n]+)",
                r"(?i)executive offices.*?,[\s\r\n]+([A-Z][a-z]+(?: [A-Z][a-z]+)*)[\s\r\n]+\d{5}",
            ],
            "fallback_keywords": [
                "executive offices",
                "headquarters",
                "principal offices",
            ],
        },
    ],
    "section_10": [
        {
            "id": "ceo_lastname",
            "prompt": (
                "What is the LAST NAME of the registrant's current Chief Executive Officer (CEO)? "
                "1. Look for 'Chief Executive Officer', 'CEO', or 'Principal Executive Officer'. "
                "2. If 'Co-CEOs' are listed, pick the first one mentioned. "
                "3. EXCLUDE titles (Mr., Dr.) and first/middle names. "
                "4. If the CEO has a compound last name (e.g., 'Von Trap'), include the full last name. "
                "Answer with ONLY the last name string."
            ),
            "regex_anchors": [
                r"(?i)([A-Z][a-z]+ [A-Z][a-z]+)[,\s]+(?:is |serves as |)(?:the |our |)(?:Chief Executive Officer|CEO)",
                r"(?i)(?:Chief Executive Officer|CEO)[:\s]+([A-Z][a-z]+ [A-Z][a-z]+)",
            ],
            "fallback_keywords": ["chief executive officer", "ceo", "serves as"],
        }
    ],
}

In [None]:
# 5. New "Smart Context" Logic
# This implements the logic: strict regex match -> zoom out -> fallback keyword -> zoom out


def merge_ranges(ranges):
    """
    Merge overlapping or touching (start, end) integer ranges.

    Args:
        ranges: List of (start, end) pairs in any order. The list is NOT
            modified; a sorted copy is used internally.

    Returns:
        A new list of ranges ordered by start offset in which no two
        entries overlap or touch. An empty input yields an empty list.

    Example:
        Input:  [(0, 100), (50, 150), (300, 400)]
        Output: [(0, 150), (300, 400)]
    """
    if not ranges:
        return []

    # Work on a sorted copy so the caller's list keeps its original order.
    ordered = sorted(ranges, key=lambda x: x[0])

    merged = [ordered[0]]
    for current in ordered[1:]:
        last = merged[-1]
        # A range starting at or before the previous end overlaps/touches it.
        if current[0] <= last[1]:
            merged[-1] = (last[0], max(last[1], current[1]))
        else:
            merged.append(current)
    return merged


def get_smart_context(full_text, config_question, window_size=1500):
    """
    Narrow a section down to the passages most likely to hold the answer.

    Stages, in order:
      1. Regex anchors: every match of every pattern in
         config_question["regex_anchors"] contributes a window of
         window_size // 2 characters on each side; overlapping windows are
         merged and the pieces joined with a "[SECTION BREAK]" marker.
      2. Fallback keywords: the first keyword (in list order) found in the
         text yields a single window around its first occurrence.
      3. Otherwise the full text is returned unchanged.

    Returns "" when full_text is empty/None.
    """
    if not full_text:
        return ""

    half_window = window_size // 2
    text_len = len(full_text)

    # Stage 1: collect a padded window for every anchor hit.
    windows = []
    for pattern in config_question.get("regex_anchors", []):
        hits = list(re.finditer(pattern, full_text))
        if hits:
            # Logged once per pattern that produced at least one match.
            print(f"  [SmartLocator] Found Anchor: {pattern[:30]}...")
        windows.extend(
            (max(0, hit.start() - half_window), min(text_len, hit.end() + half_window))
            for hit in hits
        )

    if windows:
        merged = merge_ranges(windows)
        print(f"  [SmartLocator] Found {len(windows)} matches, merged into {len(merged)} windows.")
        return "\n\n...[SECTION BREAK]...\n\n".join(full_text[lo:hi] for lo, hi in merged)

    # Stage 2: case-insensitive keyword lookup, first listed keyword wins.
    lowered = full_text.lower()
    for keyword in config_question.get("fallback_keywords", []):
        pos = lowered.find(keyword.lower())
        if pos == -1:
            continue
        print(f"  [SmartLocator] Fallback Keyword: {keyword}")
        return full_text[max(0, pos - half_window):min(text_len, pos + half_window)]

    # Stage 3: nothing located, hand back everything.
    return full_text


def ask_llm(context, prompt, model, tokenizer, max_new_tokens=50):
    """Sends a prompt to the LLM with the given context."""
    if not context or not context.strip():
        return None

    model_limit = getattr(tokenizer, "model_max_length", 32768) 
    print(f"  [LLM] Model limit: {model_limit}")
    available_for_context = model_limit - 650
    char_limit = available_for_context * 4

    if len(context) > char_limit:
        context = context[:char_limit]

    messages = [
        {"role": "system", "content": "You are a precise data extraction assistant."},
        {
            "role": "user",
            "content": f"Read this SEC 10-K filing excerpt and answer the question. \nContext: \"{context}\"\n\nQuestion: {prompt} \nIf the information is not present in the context, reply with 'NULL'.",
        },
    ]

    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs, max_new_tokens=max_new_tokens, do_sample=False, temperature=0.0
        )

    # Clean response
    response = tokenizer.decode(
        outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
    )
    answer = response.strip().split("\n")[0]

    if "null" in answer.lower() or "not found" in answer.lower():
        return None

    return answer

In [None]:
# 6. Strategy Implementations


def extract_pure_regex(doc):
    """
    Strategy 1: regex-only extraction.

    For every question in QUESTION_BANK, the section text is searched with each
    anchor pattern in order (DOTALL mode). The first pattern that matches
    supplies capture group 1, trimmed of surrounding " .,;" and capped at 100
    characters. Questions with no match, or whose section is missing/empty,
    map to None.

    Returns a dict keyed by question id.
    """
    extracted = {}
    for section_name, section_questions in QUESTION_BANK.items():
        section_text = doc.get(section_name, "")

        for question in section_questions:
            value = None
            if section_text:
                haystack = str(section_text)
                for pattern in question.get("regex_anchors", []):
                    hit = re.search(pattern, haystack, re.DOTALL)
                    if hit is not None:
                        # First matching anchor wins; later patterns are not tried.
                        value = hit.group(1).strip(" .,;")[:100]
                        break
            extracted[question["id"]] = value

    return extracted


def extract_hybrid_llm(doc, model, tokenizer):
    """
    Strategy 2: locate first, then ask the LLM.

    For each question, get_smart_context() narrows the section text (regex
    anchors, then fallback keywords, then the whole text) and ask_llm()
    extracts the answer from that reduced context.

    Returns a dict keyed by question id; values are whatever ask_llm returns.
    """
    answers = {}
    for section_name, section_questions in QUESTION_BANK.items():
        section_text = doc.get(section_name, "")
        for question in section_questions:
            # Step 1: zoom in on the relevant passage(s).
            located = get_smart_context(section_text, question)
            # Step 2: let the model read only that passage.
            answers[question["id"]] = ask_llm(
                located, question["prompt"], model, tokenizer
            )
    return answers


def extract_pure_llm(doc, model, tokenizer):
    """
    Strategy 3: hand the whole section to the LLM.

    Every question is asked against the full section text (ask_llm applies
    its own truncation). When a section is missing or empty the model is not
    called and the question maps to None.

    Returns a dict keyed by question id.
    """
    answers = {}
    for section_name, section_questions in QUESTION_BANK.items():
        section_text = doc.get(section_name, "")
        for question in section_questions:
            if section_text:
                reply = ask_llm(section_text, question["prompt"], model, tokenizer)
            else:
                reply = None
            answers[question["id"]] = reply
    return answers

In [None]:
# 7. The Tournament Loop
# Runs detailed comparison on a small batch of documents

def create_df_tournament(model, tokenizer, TARGET_DOCS=5):
    """
    Run all three extraction strategies on the first TARGET_DOCS filings.

    Streams the EDGAR corpus, runs regex / hybrid / pure-LLM extraction on
    each document and builds one row per document containing, for every
    question, the three answers plus a "<key>_Match" flag comparing the
    hybrid and pure-LLM answers ("1" equal ignoring case/whitespace, "0"
    different or only one present, None when both are missing).

    The frame is displayed, then missing values are replaced by the string
    "NULL" and the frame is returned.
    """
    print("Loading Data Stream...")
    stream = load_dataset(
        "c3po-ai/edgar-corpus",
        "default",
        split="train",
        streaming=True,
        revision="refs/convert/parquet",
    )

    rows = []

    print(f"Running Tournament on {TARGET_DOCS} docs...")
    for position, doc in enumerate(stream):
        if position >= TARGET_DOCS:
            break

        label = doc.get("filename", f"Doc {position}")
        print(f"Processing {label}...")

        # One result dict per strategy, all keyed by question id.
        regex_answers = extract_pure_regex(doc)
        hybrid_answers = extract_hybrid_llm(doc, model, tokenizer)
        llm_answers = extract_pure_llm(doc, model, tokenizer)

        row = {field: doc.get(field) for field in ("filename", "cik", "year")}

        for key in list(regex_answers.keys()):
            # Abbreviate long question ids so column names stay readable.
            stem = key.replace("incorporation_", "Inc_").replace("headquarters_", "HQ_")

            hybrid_value = hybrid_answers.get(key)
            llm_value = llm_answers.get(key)

            row[f"{stem}_Regex"] = regex_answers.get(key)
            row[f"{stem}_Hybrid"] = hybrid_value
            row[f"{stem}_LLM"] = llm_value

            # Agreement between the two LLM-based strategies.
            if hybrid_value is None and llm_value is None:
                verdict = None
            elif hybrid_value is None or llm_value is None:
                verdict = "0"
            elif hybrid_value.strip().lower() == llm_value.strip().lower():
                verdict = "1"
            else:
                verdict = "0"
            row[f"{stem}_Match"] = verdict

        rows.append(row)

    df_tournament = pd.DataFrame(rows)
    print("\n--- TOURNAMENT RESULTS ---")
    display(df_tournament)
    # Missing values become the literal string "NULL" after the frame is shown.
    df_tournament.fillna("NULL", inplace=True)
    return df_tournament

In [None]:
# 8. Run the tournament on a small sample.
# This cell only builds the DataFrame; nothing is written to disk here
# (the CSV export happens in the next cell).

# Sample run: TARGET_DOCS is left at its default of 5 documents.
df_sample = create_df_tournament(model, tokenizer)
# Feel free to add code blocks below to analyze the results

In [None]:
# 9. If the sample results look fine, write them to a CSV file
# (the DataFrame index is not written).
df_sample.to_csv("edgar_tournament_sample.csv", index=False)

In [None]:
# 10. If the sample looks fine, run the full tournament (250 documents) and write it to CSV.
# This repeats all three strategies for every document.
df_full = create_df_tournament(model, tokenizer, 250)
df_full.to_csv("edgar_tournament_full.csv", index=False)
# Alias used by the ground-truth comparison at the end of the notebook.
df_results = df_full

# Comparing Different Extraction Approaches With Ground Truth 

In [None]:
import math

# 1. Identify the columns you want to KEEP
# Only the extraction-result columns are kept. Metadata columns are excluded by
# EXACT name and the consensus columns by their "_Match" suffix. A substring test
# would also drop result columns such as "original_Inc_year_Regex", because
# "year" occurs inside their names.
exclude_terms = ["Match", "filename", "year", "cik"]
cols_to_keep = [
    col
    for col in df_full.columns
    if col not in exclude_terms and not col.endswith("_Match")
]

# 2. Create a subset (this does not change df_full)
subset = df_full[cols_to_keep]

# 3. Calculate stats on the subset
# Missing answers are stored as the literal string 'NULL'
null_counts = (subset == "NULL").sum()

# Calculate percentage based on total rows (rounded down to a whole percent)
null_percentages = (null_counts / len(df_full) * 100).apply(math.floor)

# 4. Create the final result table
summary_df = pd.DataFrame({"Null Counts": null_counts, "Null %": null_percentages})

# Filter out columns with 0 nulls if you only want to see problems
# summary_df = summary_df[summary_df['Null Counts'] > 0]

print(summary_df)

In [None]:
# Optional: if the results of the different approaches were already saved to a CSV,
# load them here instead of re-running the tournament.
# NOTE(review): "filename here" is a placeholder — replace it with a real path before
# running. As written this cell also overwrites any `df_results` set by an earlier cell.
df_results = pd.read_csv("filename here")

In [None]:
# Load the ground-truth data that the different approaches are compared against.
# NOTE(review): the path is empty by default and must be filled in before running.
ground_truth_file = "" # put file name here
df_truth = pd.read_csv(ground_truth_file)

In [None]:
import numpy as np

# --- Helper Functions ---
def normalize_text(val):
    """
    Canonicalise a free-text answer for equality comparison.

    Missing values (NaN/None) and the placeholder strings 'null', 'nan',
    'none' and '' (case-insensitive, surrounding whitespace ignored) become
    "NULL". Everything else is stripped, lower-cased, and has '.' and ','
    removed.
    """
    if pd.isna(val):
        return "NULL"
    # Strip BEFORE the placeholder check so " NULL " is recognised as missing
    # instead of being returned as the lowercase string "null".
    text = str(val).strip().lower()
    if text in ("null", "nan", "none", ""):
        return "NULL"
    return text.replace(".", "").replace(",", "")

def normalize_year(val):
    """
    Canonicalise a year value to a plain integer string (e.g. 1980.0 -> "1980").

    Missing values, placeholder strings ('null', 'nan', 'none', '') and
    anything that cannot be parsed as a number become "NULL".
    """
    try:
        if pd.isna(val) or str(val).strip().lower() in ("null", "nan", "none", ""):
            return "NULL"
        # float() first so values like "1980.0" (typical after a CSV round trip) parse.
        return str(int(float(val)))
    except (ValueError, TypeError, OverflowError):
        # Unparseable text, unsupported types, or inf: treat as missing, but let
        # unrelated exceptions (e.g. KeyboardInterrupt) propagate.
        return "NULL"

def normalize_count(val):
    """
    Canonicalise a numeric count to a plain integer string.

    Removes thousands separators, accepts float-formatted input and returns
    the integer part as text. Missing values, placeholder strings and
    anything non-numeric become "NULL".

    Example: "4,000" -> "4000",  465.0 -> "465"
    """
    try:
        if pd.isna(val) or str(val).strip().lower() in ("null", "nan", "none", ""):
            return "NULL"

        # Remove commas first (e.g. "5,317" -> "5317")
        clean_string = str(val).replace(",", "").strip()

        # Convert to float first (handles "465.0"), then int, then string
        return str(int(float(clean_string)))
    except (ValueError, TypeError, OverflowError):
        # Non-numeric text or inf: treat as missing, but do not swallow
        # unrelated exceptions such as KeyboardInterrupt.
        return "NULL"

In [None]:
def compare_approaches(df_truth, df_approach, comparison_map):
    """
    Print per-feature accuracy of each extraction approach against ground truth.

    Args:
        df_truth: Ground-truth DataFrame. If it has a "filename" column, that
            column is used to align rows with df_approach.
        df_approach: DataFrame of predictions, aligned the same way.
        comparison_map: {truth_column: [prediction_column, ...]}.

    Both inputs are copied, restricted to the files they share, normalised with
    the cleaner matching the feature (year / count / text) and compared by
    exact string equality. Results are printed; nothing is returned.
    """
    gt = df_truth.copy()
    res = df_approach.copy()

    # 1. Align both frames on the filename and keep only the shared documents
    if "filename" in gt.columns:
        gt.set_index("filename", inplace=True)
    if "filename" in res.columns:
        res.set_index("filename", inplace=True)
    common_files = gt.index.intersection(res.index)
    if len(common_files) == 0:
        print("No matching files found between Truth and Results!")
        return

    gt = gt.loc[common_files]
    res = res.loc[common_files]

    print(f"--- 🏆 TOURNAMENT RESULTS ({len(common_files)} docs) ---")

    # 2. Loop through the Map
    for truth_col, pred_cols_list in comparison_map.items():

        if truth_col not in gt.columns:
            print(f"\n[SKIP] Ground Truth column '{truth_col}' not found.")
            continue

        print(f"\nEvaluating Feature: {truth_col}")
        print(f"{'Method':<25} | {'Accuracy':<10} | {'Correct/Total'}")
        print("-" * 55)

        # Select Cleaner Logic (the count/employee rule is checked last, so it
        # takes precedence when a column name matches both rules)
        cleaner = normalize_text
        if "year" in truth_col.lower():
            cleaner = normalize_year
        if "count" in truth_col.lower() or "employee" in truth_col.lower():
            cleaner = normalize_count

        # Normalize Truth once
        y_true = gt[truth_col].apply(cleaner)

        # 3. Compare each Approach against this Truth
        scores = []
        for pred_col in pred_cols_list:
            if pred_col not in res.columns:
                print(f"{pred_col:<25} | [MISSING COLUMN]")
                continue

            y_pred = res[pred_col].apply(cleaner)

            # Calculate Score
            matches = y_true == y_pred
            acc = matches.mean()
            correct = matches.sum()

            print(f"{pred_col:<25} | {acc:.2%}    | {correct}/{len(matches)}")

            scores.append((pred_col, acc))

        # Optional: Print Winner of this round (first listed method wins ties)
        if scores:
            best_method, best_score = max(scores, key=lambda x: x[1])
            print(f" >> WINNER: {best_method} ({best_score:.2%})")

    print("\n" + "=" * 55)

In [None]:
# Maps each ground-truth column to the prediction columns scored against it.
# Prediction column names follow the tournament frame's naming:
# question id with "incorporation_" shortened to "Inc_" (so
# "original_incorporation_state" becomes "original_Inc_state"), plus the
# strategy suffix "_Regex" / "_Hybrid" / "_LLM".
tournament_map = {
    # Truth Column                : [List of Approaches to Test]
    'original_inc_state_truth':   ['original_Inc_state_Regex', 'original_Inc_state_Hybrid', 'original_Inc_state_LLM'],
    'original_inc_year_truth':    ['original_Inc_year_Regex', 'original_Inc_year_Hybrid', 'original_Inc_year_LLM'],
    'employee_count_truth':       ['employee_count_Regex', 'employee_count_Hybrid', 'employee_count_LLM']
}

compare_approaches(df_truth, df_results, tournament_map)