This is a script designed to extract ground truth from Edgar Dataset - specifically these categories might be good points that are easy to find and test:

- "In which U.S. state was this company incorporated?"
- "In what year was this company incorporated?"
- "How many full-time employees does the company have?"
- "On what date does the company's fiscal year end?"
- "In which state are the company's principal executive offices located?"
- "Which independent registered public accounting firm audited these financial statements?"


In [20]:
# importing relevant libraries
import re
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

dataset_name = "c3po-ai/edgar-corpus"
config_name = "default"
split_name = "train"




In [21]:
# will match even if the text spans multiple lines.
QUESTION_BANK = {
    "section_1": [
        {
            "id": "incorporation_state",
            "question": "In which U.S. state was this company incorporated?",
            # Improved slightly to handle extra whitespace/newlines
            "extract_regex": r"(?i)incorporated (?:under the laws of|in) (?:the state of\s*)?([A-Z][a-z]+(?: [A-Z][a-z]+)*)"
        },
        {
            "id": "incorporation_year",
            "question": "In what year was this company incorporated?",
            "extract_regex": r"(?i)(?:incorporated|founded|organized).*?in (19\d{2}|20\d{2})"
        },
        {
            "id": "employee_count",
            "question": "How many full-time employees does the company have?",
            "extract_regex": r"(?i)(?:approximately|approx\.|had|total of)\s+([0-9,]+)\s+(?:full-time|part-time)?\s*employees"
        },
        {
            "id": "fiscal_year_end",
            "question": "On what date does the company's fiscal year end?",
            "extract_regex": r"(?i)fiscal year end(?:ed|s)(?:\s+on)?\s+([A-Z][a-z]+ \d{1,2})"
        },
        {
            "id": "headquarters_state",
            "question": "In which state are the company's principal executive offices located?",
           "extract_regex": r"(?i)executive offices.*?,[\s\r\n]+([A-Z][a-z]+(?: [A-Z][a-z]+)*)[\s\r\n]+\d{5}"
        }
    ],
    "section_8": [
        {
            "id": "auditor_name",
            "question": "Which independent registered public accounting firm audited these financial statements?",
            "extract_regex": r"(?i)Report of (?:Independent )?Registered Public Accounting Firm.*?(?:[\r\n]+)\s+([A-Z][&A-Za-z \.,]+(?:LLP|LLC))"
        }
    ]
}

In [22]:
from datasets import get_dataset_config_names

dataset_name = "c3po-ai/edgar-corpus"
revision_branch = "refs/convert/parquet"

# Get list of configs for the specific Parquet branch
configs = get_dataset_config_names(
    dataset_name,
    revision=revision_branch
)

print(f"Available configs: {configs}")

Resolving data files:   0%|          | 0/78 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/36 [00:00<?, ?it/s]

Available configs: ['default']


In [23]:
def extract_fields(record):
    # Base metadata
    data = {
        "cik": record.get("cik"),
        "year": record.get("year"),
        "filename": record.get("filename")
    }

    found_count = 0

    for section_col, questions in QUESTION_BANK.items():
        # Get section text (handle missing sections safely)
        section_text = record.get(section_col, "")

        if not section_text:
            for q in questions: data[q["id"]] = None
            continue

        for q in questions:
            # Run Regex (DOTALL allows matching across newlines)
            match = re.search(q["extract_regex"], str(section_text), re.DOTALL | re.IGNORECASE)
            if match:
                data[q["id"]] = match.group(1).strip(" .,;")
                found_count += 1
            else:
                data[q["id"]] = None

    return data, found_count


In [24]:
# --- 2. THE REUSABLE FUNCTION ---
def run_extraction_job(target_good_docs, max_scan_limit, output_filename):
    """
    Scans the EDGAR dataset until it finds 'target_good_docs' valid records,
    or hits 'max_scan_limit'. Saves the result to 'output_filename'.
    """
    print(f"\n>>> STARTING JOB: Target={target_good_docs}, Max Scan={max_scan_limit}")

    # Reload stream fresh for every job so we start from the top (or resume depending on iterable nature)
    # Note: If you want to continue where you left off, you'd need an iterator outside this function.
    # For now, we assume fresh start for simplicity.
    dataset = load_dataset(
        dataset_name,
        config_name,
        split=split_name,
        streaming=True,
        revision=revision_branch
    )

    results = []
    scanned_count = 0

    with tqdm(total=target_good_docs, desc=f"Saving to {output_filename}") as pbar:
        for doc in dataset:
            scanned_count += 1

            # 1. Extract
            extracted_data, found_count = extract_fields(doc)

            # 2. Filter (Must have >= 2 fields found)
            if found_count >= 2:
                results.append(extracted_data)
                pbar.update(1)

            # 3. Stop conditions
            if len(results) >= target_good_docs:
                break
            if scanned_count >= max_scan_limit:
                print(f"\nHit max scan limit ({max_scan_limit}). Stopping early.")
                break

    # Save logic
    if len(results) > 0:
        df = pd.DataFrame(results)
        df.to_csv(output_filename, index=False)
        print(f"SUCCESS: Saved {len(df)} rows to '{output_filename}'")
        print(f"Hit Rate: {len(df)/scanned_count:.1%}")
        return df
    else:
        print("FAILURE: No valid documents found.")
        return pd.DataFrame()

In [25]:
# Run small test: 5 good docs, max scan 200
df_test = run_extraction_job(
    target_good_docs=5,
    max_scan_limit=200,
    output_filename="test_run.csv"
)

# Verify results
print("\n--- Test Run Preview ---")
print(df_test.head())


>>> STARTING JOB: Target=5, Max Scan=200


Resolving data files:   0%|          | 0/78 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/36 [00:00<?, ?it/s]

Saving to test_run.csv: 100%|██████████| 5/5 [00:01<00:00,  4.46it/s]

SUCCESS: Saved 5 rows to 'test_run.csv'
Hit Rate: 71.4%

--- Test Run Preview ---
      cik  year         filename    incorporation_state incorporation_year  \
0  103730  1993  103730_1993.txt            Delaware in               1962   
1  100240  1993  100240_1993.txt             Georgia in               1965   
2   46207  1993   46207_1993.txt  the Kingdom of Hawaii               1981   
3   60041  1993   60041_1993.txt            Delaware in               1953   
4   55387  1993   55387_1993.txt                   None               1994   

  employee_count fiscal_year_end headquarters_state auditor_name  
0            172            None       Pennsylvania         None  
1          5,317            None               None         None  
2           None            None               None         None  
3          4,000            None               None         None  
4          2,260            None               None         None  





In [None]:
# Run full job: 500 good docs, max scan 10,000 (run after sanity check/test)
df_full = run_extraction_job(
    target_good_docs=500,
    max_scan_limit=10000,
    output_filename="edgar_ground_truth.csv"
)

print("\n--- Full Run Preview ---")
print(df_full.head())