# EDGAR Ground Truth Extraction Tournament

This notebook compares three strategies for extracting data from SEC 10-K filings:

1. **Pure Regex**: Fast, baseline method using regular expressions.
2. **Hybrid (Locator + LLM)**: Uses keywords/lax regex to find the relevant context, then asks an LLM (Qwen 2.5) to extract the exact answer.
3. **Pure LLM**: Feeds the entire section (truncated to fit context) to the LLM.

### Goal
Determine which method provides the best trade-off between accuracy and cost/speed.

In [None]:
# 1. Installation & Setup
# Uncomment the line below if you need to install these libraries
# !pip install -q torch transformers datasets pandas tqdm accelerate bitsandbytes

In [None]:

# 2. Import installed libraries

import os
import re
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Never truncate long cell text when pandas renders a DataFrame
pd.options.display.max_colwidth = None

print("Setup Complete.")

In [None]:
# 3. Load Model (Qwen 2.5-7B-Instruct)

# Hugging Face Hub identifier of the instruction-tuned model used by the
# Hybrid and Pure-LLM strategies.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

# Load the tokenizer and the model weights. `tokenizer` and `model` are the
# notebook-level globals that the cells below pass into the strategies.
try:
    print(f"Loading {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype="auto",
        device_map="auto",
    )
except Exception as e:
    print(f"Error loading model: {e}")
    print("Make sure you have a GPU enabled and the libraries installed.")
    # Re-raise after printing the hints: swallowing the error would leave
    # `model` / `tokenizer` undefined and make later cells fail with an
    # unrelated-looking NameError instead of the real cause.
    raise
else:
    print("Model Loaded Successfully.")

In [None]:
# 4. Unified Configuration (Question Bank)
# Contains logic for ALL strategies: Regex Patterns, Locator Keywords, and LLM Prompts.

# Maps a filing section key (e.g. "section_1") to the questions asked of that
# section. Every question carries:
#   id            - result key / column stem
#   prompt        - question text sent to the LLM strategies
#   extract_regex - pattern for the pure-regex strategy; group(1) is the answer
#   keywords      - locator terms for the hybrid strategy, in priority order
#
# NOTE on flags: patterns whose capture group relies on capitalisation
# ([A-Z][a-z]+) must NOT use a global (?i) -- under IGNORECASE that class
# matches any word, so e.g. "incorporated by reference" would yield
# "by reference". Those patterns scope case-insensitivity to the keyword part
# with (?i:...) and keep the capture group case-sensitive.
QUESTION_BANK = {
    "section_1": [
        {
            "id": "incorporation_state",
            "prompt": "In which U.S. state was this company incorporated? Answer with ONLY the state name.",
            # Regex: Matches "Incorporated in the State of Delaware".
            # Keywords are case-insensitive; the state name must be capitalised.
            "extract_regex": r"(?i:incorporated|organized)(?i: (?:under the laws of|in))? (?i:the state of\s*)?([A-Z][a-z]+(?: [A-Z][a-z]+)*)",
            # Hybrid Locator: Keywords to find the paragraph
            "keywords": ["incorporated", "organized under", "laws of the state", "formed under"],
        },
        {
            "id": "incorporation_year",
            "prompt": "In what year was this company incorporated? Answer with ONLY the year.",
            # Regex: Matches "incorporated ... in 1985"
            "extract_regex": r"(?i)(?:incorporated|founded|organized).*?in (19\d{2}|20\d{2})",
            "keywords": ["incorporated", "founded", "organized", "formed", "year"],
        },
        {
            "id": "employee_count",
            "prompt": "How many full-time employees does the company have? Answer with ONLY the number.",
            # Regex: Matches "approximately 5,000 employees"
            "extract_regex": r"(?i)(?:approximately|approx\.|had|total of|employ)\s+([0-9,]+)(?:\s+full-time)?\s+employees",
            "keywords": ["employees", "full-time", "employed", "workforce", "persons"],
        },
        {
            "id": "fiscal_year_end",
            "prompt": "On what date does the company's fiscal year end? Answer with Month and Day (e.g., 'December 31').",
            # Regex: Matches "fiscal year ends December 31".
            # The month must be capitalised and the day must be a complete
            # 1-2 digit number, so "ended in 2019" no longer yields "in 20".
            "extract_regex": r"(?i:fiscal year end(?:ed|s)(?:\s+on)?)\s+([A-Z][a-z]+ \d{1,2})\b",
            "keywords": ["fiscal year end", "fiscal year ends", "fiscal year ended"],
        },
        {
            "id": "company_product",
            "prompt": "What is the main product, service, or business activity of this company? Answer in 2-5 words.",
            # Regex: Matches "engaged in the business of..."
            "extract_regex": r"(?i)engaged in the (?:business|manufacture|sale|development) of ([^.;]+)",
            "keywords": ["engaged in", "business of", "manufacture", "sale of", "products"],
        },
    ],
    "section_2": [
        {
            "id": "headquarters_state",
            "prompt": "In which U.S. state are the company's principal executive offices located? Answer with ONLY the state name.",
            # Regex: Matches "executive offices ... [State] [Zip]".
            # The state is either a two-letter abbreviation ("NY") or one or
            # more capitalised words ("New York").
            "extract_regex": r"(?i:executive offices).*?,\s+([A-Z]{2}|[A-Z][a-z]+(?: [A-Z][a-z]+)*)\s+\d{5}",
            "keywords": ["executive offices", "headquarters", "principal offices"],
        },
    ],
    "section_10": [
        {
            "id": "ceo_lastname",
            "prompt": "What is the Last Name of the current CEO? Answer with ONLY the last name.",
            # Regex: Matches "Mr. Smith ... CEO".
            # (?i) is kept on purpose here: it lets the capture group take a
            # mixed-case surname such as "McDonald" in full.
            "extract_regex": r"(?i)(?:Mr\.|Ms\.|Mrs\.|Dr\.)\s+([A-Z][a-z]+).*?(?:Chief Executive Officer|CEO)",
            "keywords": ["chief executive officer", "ceo", "serves as"],
        },
        {
            "id": "ceo_fullname_backup",
            "prompt": "What is the Full Name of the current CEO? Answer with ONLY the full name.",
            # Regex: Matches Name followed by CEO title
            "extract_regex": r"(?m)([A-Z][a-z]+ [A-Z]\.? [A-Z][a-z]+|[A-Z][a-z]+ [A-Z][a-z]+)\s*,?\s*(?:Chief Executive Officer|CEO)",
            "keywords": ["chief executive officer", "ceo"],
        },
    ],
}

In [None]:
# 5. Helper Functions (LLM & Context)


def get_focused_context(full_text, keywords, window_size=1500):
    """HYBRID locator: return a slice of ``full_text`` around a keyword hit.

    Keywords are tried in list order (case-insensitively); the first keyword
    that occurs anywhere in the text anchors the window, at its first
    occurrence. The window spans ``window_size // 2`` characters on each side
    of the anchor, clipped to the text bounds.

    Returns "" for empty/None text, and the first ``window_size`` characters
    when none of the keywords occur.
    """
    if not full_text:
        return ""

    haystack = full_text.lower()

    # List order is the priority order: stop at the first keyword that is
    # present, even if a later keyword appears earlier in the text.
    positions = (haystack.find(term.lower()) for term in keywords)
    anchor = next((pos for pos in positions if pos != -1), -1)

    if anchor == -1:
        # No keyword present: fall back to the beginning of the text.
        return full_text[:window_size]

    half_window = window_size // 2
    lower_bound = max(0, anchor - half_window)
    upper_bound = min(len(full_text), anchor + half_window)
    return full_text[lower_bound:upper_bound]


def ask_llm(context, prompt, model, tokenizer, max_new_tokens=50):
    """Ask the LLM one extraction question about ``context``.

    Builds a system + user chat, renders it with the tokenizer's chat
    template, generates greedily (``do_sample=False``) for at most
    ``max_new_tokens`` tokens, and decodes only the newly generated tokens.

    Returns the first line of the stripped reply, or None when ``context`` is
    empty or whitespace-only (no model call is made in that case).
    """
    if not (context and context.strip()):
        return None

    user_message = (
        "Read this SEC 10-K filing excerpt and answer the question. \n"
        f'Context: "{context}"\n\n'
        f"Question: {prompt} \n"
        "If the information is not present in the context, reply with 'NULL'."
    )
    chat = [
        {"role": "system", "content": "You are a precise data extraction assistant."},
        {"role": "user", "content": user_message},
    ]

    rendered = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer([rendered], return_tensors="pt").to(model.device)

    with torch.no_grad():
        generated = model.generate(
            **inputs, max_new_tokens=max_new_tokens, do_sample=False
        )

    # The output sequence starts with the prompt tokens; drop them so only
    # the model's continuation is decoded.
    prompt_length = inputs.input_ids.shape[1]
    reply = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)

    # Keep only the first line of the answer.
    first_line, _, _ = reply.strip().partition("\n")
    return first_line

In [None]:
# 6. Strategy Implementations


def extract_pure_regex(doc):
    """Strategy 1: answer every question with its ``extract_regex`` only.

    For each section in QUESTION_BANK the section text is read from ``doc``
    and each question's pattern is searched with ``re.DOTALL``. The answer is
    capture group 1 with surrounding spaces and ``.,;`` stripped, cut to 150
    characters. Questions with a missing/empty section or no match map to
    None.

    Returns a dict of question id -> answer string or None.
    """
    answers = {}
    for section_key, section_questions in QUESTION_BANK.items():
        raw = doc.get(section_key, "")
        # None marks "nothing to search" so every question in the section
        # falls through to a None answer.
        haystack = str(raw) if raw else None

        for question in section_questions:
            found = (
                re.search(question["extract_regex"], haystack, re.DOTALL)
                if haystack is not None
                else None
            )
            answers[question["id"]] = (
                found.group(1).strip(" .,;")[:150] if found else None
            )
    return answers


def extract_hybrid_llm(doc, model, tokenizer):
    """Strategy 2: Hybrid (keyword locator + LLM extraction).

    For every question, ``get_focused_context`` narrows the section text to a
    window around the first matching keyword (default window size), and
    ``ask_llm`` answers the question's prompt from that window only.

    Returns a dict of question id -> LLM answer (None when the section text
    is empty, because ``ask_llm`` skips empty contexts).
    """
    answers = {}
    for section_key, section_questions in QUESTION_BANK.items():
        section_text = doc.get(section_key, "")

        for question in section_questions:
            # Step 1: locate a focused excerpt using the question's keywords.
            excerpt = get_focused_context(section_text, question["keywords"])
            # Step 2: let the LLM extract the answer from that excerpt.
            answers[question["id"]] = ask_llm(
                excerpt, question["prompt"], model, tokenizer
            )

    return answers


def extract_pure_llm(doc, model, tokenizer, max_context_chars=12000):
    """Strategy 3: Pure LLM (whole section as context).

    Each section's text is cut to its first ``max_context_chars`` characters
    and every question for that section is sent to ``ask_llm`` with that same
    context. Questions whose section is missing or empty map to None without
    calling the model.

    Returns a dict of question id -> LLM answer or None.
    """
    answers = {}
    for section_key, section_questions in QUESTION_BANK.items():
        section_text = doc.get(section_key, "")
        # Plain head-truncation keeps the prompt bounded; None marks an
        # absent section so no model call is made for it.
        context = section_text[:max_context_chars] if section_text else None

        # One call per question. A single prompt asking for all fields at
        # once would save tokens, but is not what this strategy does.
        for question in section_questions:
            answers[question["id"]] = (
                ask_llm(context, question["prompt"], model, tokenizer)
                if context is not None
                else None
            )

    return answers

In [None]:
# 7. The Tournament Loop Function
# Runs detailed comparison on a small batch of documents

def create_df_tournament(model, tokenizer, TARGET_DOCS=5):
    """Run all three strategies on streamed filings and tabulate the answers.

    Streams the train split of ``c3po-ai/edgar-corpus`` and processes the
    first ``TARGET_DOCS`` documents. Each document becomes one row with a
    ``Filename`` column plus three columns per question: ``<key>_Re`` (pure
    regex), ``<key>_Hy`` (hybrid) and ``<key>_Lu`` (pure LLM), where ``<key>``
    is the question id with a few long prefixes abbreviated. Missing values
    are replaced by the string "NULL".

    The resulting DataFrame is displayed and returned.
    """
    # Prefix abbreviations for column names, applied in this order.
    prefix_abbreviations = (
        ("incorporation_", "Inc_"),
        ("headquarters_", "HQ_"),
        ("company_", "Hz_"),
    )

    print("Loading Data Stream...")
    stream = load_dataset(
        "c3po-ai/edgar-corpus",
        "default",
        split="train",
        streaming=True,
        revision="refs/convert/parquet",
    )

    rows = []

    print(f"Running Tournament on {TARGET_DOCS} docs...")

    for position, filing in enumerate(stream):
        if position >= TARGET_DOCS:
            break

        label = filing.get("filename", f"Doc {position}")
        print(f"Processing {label}...")

        # Column suffix -> that strategy's answers; insertion order fixes
        # both the execution order and the column order.
        by_suffix = {
            "_Re": extract_pure_regex(filing),
            "_Hy": extract_hybrid_llm(filing, model, tokenizer),
            "_Lu": extract_pure_llm(filing, model, tokenizer),
        }

        record = {"Filename": filing.get("filename")}

        # The regex result holds an entry for every question id, so its keys
        # drive the column set.
        for field in by_suffix["_Re"]:
            column_stem = field
            for long_prefix, short_prefix in prefix_abbreviations:
                column_stem = column_stem.replace(long_prefix, short_prefix)

            for suffix, strategy_answers in by_suffix.items():
                record[f"{column_stem}{suffix}"] = strategy_answers.get(field)

        rows.append(record)

    df_tournament = pd.DataFrame(rows).fillna("NULL")
    print("\n--- TOURNAMENT RESULTS ---")
    display(df_tournament)
    return df_tournament

In [None]:
# 8. Run the tournament on a small sample and preview the results.

# Sample size for the test run. It is passed explicitly so that changing it
# here actually changes how many documents are processed.
num_of_docs = 5
df_sample = create_df_tournament(model, tokenizer, TARGET_DOCS=num_of_docs)
print("\n--- Test Run Preview ---")
df_sample.head()
# Feel free to add code blocks below to analyze the results

In [None]:
# 9. If the sample looks fine, write it to a CSV file (row index omitted)
sample_csv_path = "edgar_tournament_sample.csv"
df_sample.to_csv(sample_csv_path, index=False)

In [None]:
# 10. If the sample looks fine, run the full batch and write it to CSV
full_run_docs = 150
full_csv_path = "edgar_tournament_full.csv"
df_full = create_df_tournament(model, tokenizer, TARGET_DOCS=full_run_docs)
df_full.to_csv(full_csv_path, index=False)