This is a script designed to extract ground truth from Edgar Dataset - specifically these categories might be good points that are easy to find and test:

- "In which U.S. state was this company incorporated?"
- "In what year was this company incorporated?"
- "How many full-time employees does the company have?"
- "On what date does the company's fiscal year end?"
- "In which state are the company's principal executive offices located?"
- "Which independent registered public accounting firm audited these financial statements?" (dropped, this is really hard)


In [None]:
!pip install -q torch transformers datasets pandas tqdm accelerate

In [None]:
# importing relevant libraries
import os
import re
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
# Configure display to show full text in pandas
pd.set_option('display.max_colwidth', None)
dataset_name = "c3po-ai/edgar-corpus"
config_name = "default"
split_name = "train"




In [None]:
# 3. Unified Configuration (Question Bank)
# Contains logic for ALL strategies: Regex Patterns, Locator Keywords, and LLM Prompts.

QUESTION_BANK = {
    "section_1": [
        {
            "id": "incorporation_state",
            "prompt": "In which U.S. state was this company incorporated? Answer with ONLY the state name.",
            # Regex: Matches "Incorporated in the State of Delaware"
            "extract_regex": r"(?i)(?:[Ii]ncorporated|[Oo]rganized)(?: (?:under the laws of|in))? (?:the [Ss]tate of\s*)?([A-Z][a-z]+(?: [A-Z][a-z]+)*)",
            # Hybrid Locator: Keywords to find the paragraph
            "keywords": ["incorporated", "organized under", "laws of the state", "formed under"]
        },
        {
            "id": "incorporation_year",
            "prompt": "In what year was this company incorporated? Answer with ONLY the year.",
            # Regex: Matches "incorporated ... in 1985"
            "extract_regex": r"(?i)(?:incorporated|founded|organized).*?in (19\d{2}|20\d{2})",
            "keywords": ["incorporated", "founded", "organized", "formed", "year"]
        },
        {
            "id": "employee_count",
            "prompt": "How many full-time employees does the company have? Answer with ONLY the number.",
            # Regex: Matches "approximately 5,000 employees"
            "extract_regex": r"(?i)(?:approximately|approx\.|had|total of|employ)\s+([0-9,]+)(?:\s+full-time)?\s+employees",
            "keywords": ["employees", "full-time", "employed", "workforce", "persons"]
        },
        {
            "id": "fiscal_year_end",
            "prompt": "On what date does the company's fiscal year end? Answer with Month and Day (e.g., 'December 31').",
            # Regex: Matches "fiscal year ends December 31"
            "extract_regex": r"(?i)fiscal year end(?:ed|s)(?:\s+on)?\s+([A-Z][a-z]+ \d{1,2})",
            "keywords": ["fiscal year end", "fiscal year ends", "fiscal year ended"]
        },
        {
            "id": "company_product",
            "prompt": "What is the main product, service, or business activity of this company? Answer in 2-5 words.",
            # Regex: Matches "engaged in the business of..."
            "extract_regex": r"(?i)engaged in the (?:business|manufacture|sale|development) of ([^.;]+)",
            "keywords": ["engaged in", "business of", "manufacture", "sale of", "products"]
        }
    ],
    "section_2": [
         {
            "id": "headquarters_state",
            "prompt": "In which U.S. state are the company's principal executive offices located? Answer with ONLY the state name.",
            # Regex: Matches "executive offices ... [State] [Zip]"
            "extract_regex": r"(?i)executive offices.*?,[\s\r\n]+([A-Z][a-z]+(?: [A-Z][a-z]+)*)[\s\r\n]+\d{5}",
            "keywords": ["executive offices", "headquarters", "principal offices"]
        }
    ],
    "section_10": [
       {
            "id": "ceo_lastname",
            "prompt": "What is the Last Name of the current CEO? Answer with ONLY the last name.",
            # Regex: Matches "Mr. Smith ... CEO"
            "extract_regex": r"(?i)(?:Mr\.|Ms\.|Mrs\.|Dr\.)\s+([A-Z][a-z]+).*?(?:Chief Executive Officer|CEO)",
            "keywords": ["chief executive officer", "ceo", "serves as"]
        },
        {
            "id": "ceo_fullname_backup",
            "prompt": "What is the Full Name of the current CEO?",
             # Regex: Matches Name followed by CEO title
            "extract_regex": r"(?m)([A-Z][a-z]+ [A-Z]\.? [A-Z][a-z]+|[A-Z][a-z]+ [A-Z][a-z]+)\s*,?\s*(?:Chief Executive Officer|CEO)",
             "keywords": ["chief executive officer", "ceo"]
        }
    ]
}

In [5]:
from datasets import get_dataset_config_names

dataset_name = "c3po-ai/edgar-corpus"
revision_branch = "refs/convert/parquet"

# Get list of configs for the specific Parquet branch
configs = get_dataset_config_names(
    dataset_name,
    revision=revision_branch
)

print(f"Available configs: {configs}")

Available configs: ['default']


In [None]:
def extract_fields(record):
    # Base metadata
    data = {
        "cik": record.get("cik"),
        "year": record.get("year"),
        "filename": record.get("filename")
    }

    found_count = 0

    for section_col, questions in QUESTION_BANK.items():
        # Get section text (handle missing sections safely)
        section_text = record.get(section_col, "")

        if not section_text:
            for q in questions: data[q["id"]] = None
            continue

        for q in questions:
            # Run Regex (DOTALL allows matching across newlines)
            match = re.search(q["extract_regex"], str(section_text), re.DOTALL)
            if match:
            # Truncate 'Product' to 150 chars so it doesn't accidentally grab a whole paragraph
                clean_ans = match.group(1).strip(" .,;")[:150] 
                data[q["id"]] = clean_ans
                found_count += 1
            else:
                data[q["id"]] = None

    return data, found_count


In [None]:
# --- 2. THE REUSABLE FUNCTION ---
def run_extraction_job(target_good_docs, max_scan_limit, output_filename):
    """
    Scans the EDGAR dataset until it finds 'target_good_docs' valid records,
    or hits 'max_scan_limit'. Saves the result to 'output_filename'.
    """
    print(f"\n>>> STARTING JOB: Target={target_good_docs}, Max Scan={max_scan_limit}")


    dataset = load_dataset(
        dataset_name,
        config_name,
        split=split_name,
        streaming=True,
        revision=revision_branch
    )

    results = []
    scanned_count = 0

    with tqdm(total=target_good_docs, desc=f"Saving to {output_filename}") as pbar:
        for doc in dataset:
            scanned_count += 1

            # 1. Extract
            extracted_data, found_count = extract_fields(doc)

            # 2. Filter (Must have >= 2 fields found)
            if found_count >= 2:
                results.append(extracted_data)
                pbar.update(1)

            # 3. Stop conditions
            if len(results) >= target_good_docs:
                break
            if scanned_count >= max_scan_limit:
                print(f"\nHit max scan limit ({max_scan_limit}). Stopping early.")
                break

    # Save logic
    if len(results) > 0:
        df = pd.DataFrame(results) 
        df.fillna("NULL")
        df.to_csv(output_filename, index=False)
        print(f"SUCCESS: Saved {len(df)} rows to '{output_filename}'")
        print(f"Hit Rate: {len(df)/scanned_count:.1%}")
        return df
    else:
        print("FAILURE: No valid documents found.")
        return pd.DataFrame()

In [21]:
# Run small test: 5 good docs, max scan 200
df_test = run_extraction_job(
    target_good_docs=5,
    max_scan_limit=200,
    output_filename="test_run.csv"
)

# Verify results
print("\n--- Test Run Preview ---")
print(df_test.head())


>>> STARTING JOB: Target=5, Max Scan=200


Saving to test_run.csv: 100%|██████████| 5/5 [00:00<00:00,  7.99it/s]

SUCCESS: Saved 5 rows to 'test_run.csv'
Hit Rate: 71.4%

--- Test Run Preview ---
      cik  year         filename incorporation_state incorporation_year  \
0   92116  1993   92116_1993.txt          California               1929   
1  103730  1993  103730_1993.txt            Delaware               1962   
2  100240  1993  100240_1993.txt             Georgia               1965   
3   60041  1993   60041_1993.txt            Delaware               1953   
4   55387  1993   55387_1993.txt                None               1994   

  employee_count fiscal_year_end headquarters_state company_product  \
0           None            None               None            None   
1            172            None       Pennsylvania            None   
2          5,317            None               None            None   
3          4,000            None               None            None   
4          2,260            None               None            None   

  ceo_lastname ceo_fullname_backup  
0  




In [22]:
# Run full job: 500 good docs, max scan 10,000 (run after sanity check/test)
df_full = run_extraction_job(
    target_good_docs=500,
    max_scan_limit=10000,
    output_filename="edgar_ground_truth.csv"
)

print("\n--- Full Run Preview ---")
print(df_full.head())


>>> STARTING JOB: Target=500, Max Scan=10000


Saving to edgar_ground_truth.csv: 100%|██████████| 500/500 [00:02<00:00, 186.35it/s]

SUCCESS: Saved 500 rows to 'edgar_ground_truth.csv'
Hit Rate: 61.1%

--- Full Run Preview ---
      cik  year         filename incorporation_state incorporation_year  \
0   92116  1993   92116_1993.txt          California               1929   
1  103730  1993  103730_1993.txt            Delaware               1962   
2  100240  1993  100240_1993.txt             Georgia               1965   
3   60041  1993   60041_1993.txt            Delaware               1953   
4   55387  1993   55387_1993.txt                None               1994   

  employee_count fiscal_year_end headquarters_state company_product  \
0           None            None               None            None   
1            172            None       Pennsylvania            None   
2          5,317            None               None            None   
3          4,000            None               None            None   
4          2,260            None               None            None   

  ceo_lastname ceo_fullname_




In [23]:
df_full["incorporation_state"].unique()

array(['California', 'Delaware', 'Georgia', None, 'Virginia', 'Indiana',
       'Texas', 'Massachusetts', 'June', 'December', 'New Jersey',
       'February', 'Missouri', 'August', 'Pennsylvania', 'Kansas', 'Ohio',
       'Florida', 'North Carolina', 'New York', 'November', 'Maryland',
       'Colorado', 'Puerto Rico', 'Michigan', 'Washington', 'January',
       'Lehman Brothers Financial Products Inc', 'Wisconsin', 'Mass',
       'Registrant', 'March', 'Vermont', 'Utah', 'Tennessee', 'Louisiana',
       'Illinois', 'South Dakota', 'West Virginia', 'Iowa', 'Hawaii',
       'Kentucky', 'Nevada', 'Oregon', 'July', 'Minnesota', 'Ireland',
       'October', 'September', 'Alabama', 'May', 'Canada', 'Maine',
       'New Mexico', 'New York State', 'April'], dtype=object)

# Cells below will be only be executed if using a GPU

In [None]:
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

print(f"Loading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto", 
    device_map="auto"
)
print("Model Loaded Successfully.")