# EDGAR Ground Truth Single Feature Extraction/Validation

**Goal**: Validate specific features against Ground Truth with high precision.

**Methodology**:
1. **Protocol A (Extraction)**: Use LLM to extract `Value ||| Evidence`. The evidence (quote) is required.
2. **Protocol B (Validation)**: Compare extracted value with Ground Truth using Fuzzy Matching.
3. **Protocol C (Judge)**: (Optional) Use an LLM to judge the ambiguous cases that Protocol B marks `?` (fuzzy score between 50 and 90).
4. **Protocol D (Discovery)**: Bootstrap Ground Truth for NEW features by extracting and manually reviewing evidence.


In [None]:
# 1. Installation & Setup
# !pip install -q torch transformers datasets pandas tqdm accelerate bitsandbytes thefuzz python-Levenshtein

import os
import re
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Fuzzy Matching for Validation
from thefuzz import fuzz

# Show full cell contents (no column-width truncation) so long evidence quotes
# remain readable when result DataFrames are displayed.
pd.set_option('display.max_colwidth', None)
print("Setup Complete.")

In [None]:
# 2. Load Model (Qwen 2.5-7B-Instruct)
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, logging

# Only surface transformers log messages at error level.
logging.set_verbosity_error()

# Module-level handles used by the extraction cells below.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
DTYPE = torch.bfloat16
# NOTE: in the code shown here DEVICE is only used in the final status print;
# actual weight placement is decided by device_map="auto" below.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading model...")
# NOTE(review): trust_remote_code=True lets the hub repository run its own
# Python code at load time — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)
# Fall back to the EOS token for padding when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=DTYPE,
    device_map="auto",
    trust_remote_code=True,
)
# Inference only: put the module in evaluation mode.
model.eval()
print("Loaded:", MODEL_NAME, "| device:", DEVICE)

In [None]:
# 3. Configuration & Question Bank

# TARGET FEATURE to run (change this to switch focus).
# It is passed to run_validation_loop, which needs the id to be a key of
# GT_COLUMN_MAPPING; otherwise that function prints an error and returns an
# empty DataFrame.
TARGET_FEATURE_ID = "company_name"
# Maps a QUESTION_BANK feature id to the Ground Truth CSV column that holds its
# expected value. Features without an entry here (e.g. "headquarters_city")
# cannot be validated with Protocol B.
GT_COLUMN_MAPPING = {
    "company_name": "company_name_truth",
    "original_incorporation_state": "original_inc_state_truth",
    "original_incorporation_year": "original_inc_year_truth",
    "employee_count": "employee_count_truth",
}

# Extraction questions, grouped by the document field they are asked against.
# - Top-level keys are field names: extract_feature reads `doc.get(<key>)` and
#   uses that text as the context for every question listed under the key.
#   (Presumably "section_1" / "section_10" are 10-K Items 1 and 10 — confirm
#   against the dataset schema.)
# - Each question is a dict with a unique "id" (the feature id used throughout
#   the notebook) and the "prompt" sent to the model.
QUESTION_BANK = {
    "section_1": [
        {
            "id": "company_name",
            "prompt": (
                "What is the full legal name of the company? "
                "Look for the entity being introduced (e.g., 'X Corp (the Company)'). "
                "Exclude headers like 'Item 1' or 'General'. "
                "Answer with ONLY the legal name."
            ),
        },
        # --- MENTOR'S REQUEST: HQ CITY ---
        {
            "id": "headquarters_city",
            "prompt": (
                "What **city** are the company's executive offices located in? "
                "Look for the phrase 'executive offices are located at'. "
                "Answer with ONLY the city name (one word if possible)."
            ),
        },
        # --- STATE: ORIGINAL (Not Current) ---
        {
            "id": "original_incorporation_state",
            "prompt": (
                "In which U.S. state was the company originally incorporated or organized? "
                "1. Look for 'originally incorporated in [State]' or 'organized as a [State] corporation'. "
                "2. If it mentions 'reincorporated in Delaware', IGNORE Delaware and find the PREVIOUS state. "
                "3. If no history is mentioned, return the current state. "
                "Answer with ONLY the state name."
            ),
        },
        # --- YEAR: ORIGINAL (Not Re-inc) ---
        {
            "id": "original_incorporation_year",
            "prompt": (
                "In what year was the company originally incorporated or organized? "
                "Ignore reincorporation dates. Answer with the EARLIEST year found (YYYY). "
                "If no other history is mentioned, return the year mentioned."
            ),
        },
        # --- EMPLOYEES: FULL-TIME (Exclude Enrollment) OR TOTAL ---
        {
            "id": "employee_count",
            "prompt": (
                "How many full-time employees does the company have? "
                "1. Exclude 'enrollment', 'members', or 'agents'. "
                "2. If full-time is not specified, return Total. "
                "Answer with ONLY the integer."
            ),
        },
        {
            "id": "headquarters_state",
            "prompt": "In which U.S. state are the company's principal executive offices located? Answer with ONLY the state name. If not found, answer NULL.",
        },
    ],
    "section_10": [
        {
            "id": "ceo_lastname",
            "prompt": "What is the Last Name of the current CEO? Answer with ONLY the last name. If not found, answer NULL.",
        }
    ],
}

In [None]:
# 4. Protocol A: Evidence-First Extraction

def _parse_value_evidence(response):
    """Split a raw model reply into ``(value, evidence)``.

    Only the first line of the reply is considered, so any explanation the
    model appends on later lines is ignored.

    Returns:
        ``(value, evidence)`` when the line has the ``VALUE ||| EVIDENCE``
        shape and the value is non-empty and not ``NULL``;
        ``(None, None)`` when the reply is empty, the value is empty, or the
        reply signals ``NULL``;
        ``(first_line, "FORMAT_ERROR")`` when the delimiter is missing, so the
        row can be spotted during manual review.
    """
    first_line = response.strip().split("\n")[0].strip()

    # Split on the FIRST delimiter only so evidence that itself contains
    # "|||" is kept whole instead of being cut off.
    value, delimiter, evidence = first_line.partition("|||")
    if delimiter:
        value = value.strip()
        # An empty value is not an answer; treat it like an explicit NULL.
        if not value or value.lower() == "null":
            return None, None
        return value, evidence.strip()

    # Fallback parsing: match NULL as a whole word only, so legitimate
    # answers that merely contain the letters (e.g. "Nullsoft Inc") survive.
    if not first_line or re.search(r"\bnull\b", first_line, re.IGNORECASE):
        return None, None
    return first_line, "FORMAT_ERROR"


def ask_llm_with_evidence(context, prompt, model, tokenizer, max_new_tokens=100):
    """Ask the LLM for an answer together with the quote that supports it.

    Args:
        context: Document text to search. ``None``, empty or whitespace-only
            text short-circuits to ``(None, None)`` without calling the model.
        prompt: The question to ask about ``context``.
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Matching tokenizer (must support ``apply_chat_template``).
        max_new_tokens: Generation budget for the ``VALUE ||| EVIDENCE`` reply.

    Returns:
        ``(value, evidence)`` on success, ``(None, None)`` when nothing was
        found, or ``(raw_first_line, "FORMAT_ERROR")`` when the model ignored
        the output format.
    """
    if not context or not context.strip():
        return None, None

    # Standard limit check (same as tournament code): rough budget of
    # ~4 characters per token, keeping 1000 tokens free for prompt + answer.
    model_limit = getattr(tokenizer, "model_max_length", 32768)
    available_for_context = model_limit - 1000  # Reserve more for evidence output
    char_limit = available_for_context * 4
    if len(context) > char_limit:
        context = context[:char_limit]

    system_prompt = (
        "Do not answer unless you can quote the exact sentence from the text. "
        "Output Format: VALUE ||| EVIDENCE"
    )

    user_prompt = (
        f"Read this text: \"{context}\"\n\n"
        f"Question: {prompt}\n\n"
        "Rules:\n"
        "1. Find the exact sentence supporting the answer.\n"
        "2. Extract the short value (e.g., 'Delaware').\n"
        "3. Output ONLY: Value ||| Evidence\n"
        "4. If not found, output: NULL ||| NULL"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        # Greedy decoding. `temperature` is a sampling-only knob, so it is not
        # passed together with do_sample=False (the combination is flagged as
        # an inconsistent generation config by transformers).
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # Drop the prompt tokens; decode only what the model generated.
    prompt_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return _parse_value_evidence(response)


def extract_feature(doc, feature_id, model, tokenizer):
    """Extract one feature from a document using the Evidence-First protocol.

    Looks ``feature_id`` up in ``QUESTION_BANK``; the section key under which
    the question is registered selects the document field used as context.

    Args:
        doc: Mapping of field name -> text (one corpus record).
        feature_id: The ``"id"`` of a question in ``QUESTION_BANK``.
        model: Model forwarded to ``ask_llm_with_evidence``.
        tokenizer: Tokenizer forwarded to ``ask_llm_with_evidence``.

    Returns:
        The ``(value, evidence)`` tuple from ``ask_llm_with_evidence``, or
        ``(None, None)`` (after printing a warning) when the feature id is
        unknown.
    """
    # Stop at the FIRST matching question. The previous nested loop only broke
    # out of the inner loop, so a duplicate id in a later section would have
    # silently replaced the first match.
    match = next(
        (
            (section, question["prompt"])
            for section, questions in QUESTION_BANK.items()
            for question in questions
            if question["id"] == feature_id
        ),
        None,
    )

    if match is None or not match[1]:
        print(f"Warning: Feature ID '{feature_id}' not found in Question Bank.")
        return None, None

    target_section, prompt = match
    text = doc.get(target_section, "")
    return ask_llm_with_evidence(text, prompt, model, tokenizer)

In [None]:
# 5. Protocol B: Validation Logic

def _normalize_for_match(value):
    """Return a canonical lowercase string for comparison.

    Whole numbers are reduced to their bare digits so that the float pandas
    produces for a numeric CSV column (``1985.0``), the plain string
    (``"1985"``) and a thousands-separated form (``"1,200"``) all compare
    equal. Any other value is lowercased and stripped of surrounding
    whitespace and periods.
    """
    # pandas reads integer columns containing blanks as float (1985 -> 1985.0).
    if isinstance(value, float) and value.is_integer():
        value = int(value)

    text = str(value).lower().strip().strip(".")

    # Whole number, optionally with thousands separators and/or a ".0" tail.
    number = re.fullmatch(r"(\d{1,3}(?:,\d{3})+|\d+)(?:\.0+)?", text)
    if number:
        return number.group(1).replace(",", "")
    return text


def validate_row(val_ext, val_gt):
    """Compare an extracted value against its Ground Truth.

    Args:
        val_ext: Value produced by the extractor (``None``/NaN when nothing
            was extracted).
        val_gt: Ground Truth value (``None``/NaN when the truth is "absent").

    Returns:
        ``(score, status)`` where ``score`` is 0-100 and ``status`` is
        ``"1"`` (pass), ``"0"`` (fail) or ``"?"`` (ambiguous, fuzzy score
        between 50 and 90 inclusive).
    """
    ext_missing = pd.isna(val_ext)
    gt_missing = pd.isna(val_gt)
    if ext_missing and gt_missing:
        return 100, "1"  # Both null: correctly extracted "nothing"
    if ext_missing or gt_missing:
        return 0, "0"  # Exactly one side is null

    s1 = _normalize_for_match(val_ext)
    s2 = _normalize_for_match(val_gt)

    # Exact match after normalization
    if s1 == s2:
        return 100, "1"

    # Fuzzy match
    score = fuzz.ratio(s1, s2)
    if score > 90:
        return score, "1"
    if score < 50:
        return score, "0"
    return score, "?"  # Ambiguous: left for Protocol C

def run_validation_loop(target_feature, gt_csv_path, limit=10):
    """Run Protocol A + B for one feature over documents that have Ground Truth.

    Streams the EDGAR corpus, extracts ``target_feature`` from every document
    whose filename appears in the Ground Truth CSV, and scores the extraction
    with ``validate_row``. Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        target_feature: Feature id; must be a key of ``GT_COLUMN_MAPPING``.
        gt_csv_path: Path to the Ground Truth CSV (needs a ``filename`` column
            and the mapped truth column).
        limit: Maximum number of documents to process.

    Returns:
        DataFrame with one row per processed document (columns ``Filename``,
        ``GT_Value``, ``Ext_Value``, ``Protocol_B_Score``,
        ``Protocol_B_Status``, ``Evidence``); an empty DataFrame when the
        Ground Truth columns are missing or nothing was processed.
    """
    print(f"--- STARTING VALIDATION FOR: {target_feature} ---")

    # 1. Load Ground Truth
    df_gt = pd.read_csv(gt_csv_path)
    gt_col = GT_COLUMN_MAPPING.get(target_feature)

    if not gt_col or gt_col not in df_gt.columns:
        print(f"Error: Ground Truth column '{gt_col}' not found in CSV.")
        return pd.DataFrame()
    if "filename" not in df_gt.columns:
        print("Error: Ground Truth CSV has no 'filename' column.")
        return pd.DataFrame()

    # filename -> truth value, built once so each streamed document costs an
    # O(1) lookup. The first row wins when a filename is duplicated.
    gt_by_file = (
        df_gt.dropna(subset=["filename"])
        .drop_duplicates(subset="filename", keep="first")
        .set_index("filename")[gt_col]
        .to_dict()
    )

    # Never wait for more documents than the Ground Truth can match; assumes
    # filenames are unique within the corpus — TODO confirm.
    max_docs = min(limit, len(gt_by_file))

    results = []
    if max_docs <= 0:
        return pd.DataFrame(results)

    # 2. Load Documents (Streaming with Fix for Parquet Revision)
    dataset = load_dataset(
        "c3po-ai/edgar-corpus",
        "default",
        split="train",
        streaming=True,
        revision="refs/convert/parquet",
    )

    for doc in dataset:
        fname = doc.get('filename')
        # Ground Truth is sparse: skip documents without a truth row.
        if fname not in gt_by_file:
            continue

        print(f"Processing {fname}...")

        # Extract
        val_ext, evd_ext = extract_feature(doc, target_feature, model, tokenizer)

        # Get Truth
        val_gt = gt_by_file[fname]

        # Validate
        score, status = validate_row(val_ext, val_gt)

        results.append({
            "Filename": fname,
            "GT_Value": val_gt,
            "Ext_Value": val_ext,
            "Protocol_B_Score": score,
            "Protocol_B_Status": status,
            "Evidence": evd_ext,
        })

        # Stop as soon as the quota is met. Checking here (rather than on the
        # next matching document) avoids streaming the rest of the corpus
        # while waiting for a match that may never come.
        if len(results) >= max_docs:
            break

    return pd.DataFrame(results)

In [None]:
# 6. Run Validation

# Update path to your actual GT CSV location.
# The CSV must contain a 'filename' column plus the truth column that
# GT_COLUMN_MAPPING assigns to TARGET_FEATURE_ID.
GT_PATH = "../csvs/ground_truth/Ground Truth Data - edgar_gt_verified_slim.csv"

# df_val is reused by the Protocol C cell below.
df_val = run_validation_loop(
    target_feature=TARGET_FEATURE_ID, # Defined in Config cell
    gt_csv_path=GT_PATH,
    limit=5
)

# Display Results
print("\n--- VALIDATION RESULTS ---")
display(df_val)

In [None]:
# 7. Protocol C: The Judge (Binary 1/0)
# Run this cell to resolve ambiguous cases ('?') from Protocol B

def judge_row(row):
    """Resolve one validation row to a binary verdict ('1' or '0').

    Rows that Protocol B already decided keep their status. Ambiguous rows
    ('?') are meant to be sent to an LLM judge; that call is not wired up yet,
    so they are currently counted as failures.
    """
    verdict = row['Protocol_B_Status']
    if verdict in ('1', '0'):
        # Clear pass/fail from Protocol B: nothing to judge.
        return verdict

    # Judge prompt for the ambiguous case (built but not yet sent anywhere).
    prompt = "".join([
        "Ground Truth says: '{}'. ".format(row['GT_Value']),
        "Extraction says: '{}' based on evidence '{}'. ".format(
            row['Ext_Value'], row['Evidence']
        ),
        "Is the extraction factually correct despite the text mismatch? ",
        "Answer 1 for Yes, 0 for No.",
    ])

    # Placeholder: in a real run, send `prompt` to the LLM.
    # For this demo every ambiguous row is assumed to fail.
    return "0"

# Judge every row and show the verdict next to the Protocol B status.
# Skipped when validation returned an empty DataFrame (no rows to judge).
if not df_val.empty:
    df_val['Judge_Valid'] = df_val.apply(judge_row, axis=1)
    display(df_val[[ 'GT_Value', 'Ext_Value', 'Protocol_B_Status', 'Judge_Valid', 'Evidence']])

## Protocol D: Discovery Mode (New Feature Extraction)

**Goal**: Extract a NEW feature that has no Ground Truth yet.

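**Output**: Creates `discovered_GT.csv` on the first run. If the file already exists, it is loaded, merged with the new extractions, and written to `discovered_GT_updated.csv` (the existing file is left untouched), so you can manually review the evidence and mark it as truth.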

In [None]:
def discover_new_feature(target_feature, output_file="discovered_GT.csv", limit=50):
    """Bootstrap Ground Truth candidates for a feature that has none yet.

    Streams the first ``limit`` corpus documents, extracts ``target_feature``
    with its supporting evidence, and merges the results into the discovery
    table. Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        target_feature: Feature id present in ``QUESTION_BANK``.
        output_file: Discovery CSV. When it already exists it is loaded and
            the merged table is written next to it with an ``_updated``
            suffix, leaving the existing file untouched.
        limit: Number of documents to read from the stream.

    Returns:
        The first rows (``DataFrame.head()``) of the merged discovery table.
    """
    print(f"--- STARTING DISCOVERY FOR: {target_feature} ---")

    value_col = f"{target_feature}_value"
    evidence_col = f"{target_feature}_evidence"

    # 1. Load Existing CSV if available (Smart Append)
    update = os.path.exists(output_file)
    if update:
        print(f"Loading existing file: {output_file}")
        df_master = pd.read_csv(output_file)
        # A column that was empty in the CSV is read back as float64; writing
        # text into it would trigger pandas' incompatible-dtype warning/error.
        # Make the feature columns object-typed before any cell is assigned.
        for col in (value_col, evidence_col):
            if col in df_master.columns:
                df_master[col] = df_master[col].astype(object)
            else:
                df_master[col] = pd.Series(index=df_master.index, dtype=object)
    else:
        print("Creating NEW discovery file.")
        df_master = pd.DataFrame(columns=["filename", "cik", "year"])

    # filename -> first row label, built once instead of scanning per document.
    row_by_file = {}
    for row_label, known_name in zip(df_master.index, df_master["filename"]):
        row_by_file.setdefault(known_name, row_label)

    # 2. Stream Data
    dataset = load_dataset(
        "c3po-ai/edgar-corpus",
        "default",
        split="train",
        streaming=True,
        revision="refs/convert/parquet",
    )

    new_rows = []
    updates = 0

    for count, doc in enumerate(tqdm(dataset, total=limit)):
        if count >= limit:
            break

        fname = doc.get("filename")

        # Extract Data
        val, evd = extract_feature(doc, target_feature, model, tokenizer)

        if fname in row_by_file:
            # UPDATE the row already present in the master table
            row_label = row_by_file[fname]
            df_master.at[row_label, value_col] = val
            df_master.at[row_label, evidence_col] = evd
            updates += 1
        else:
            # CREATE new row, capturing the identity columns as well
            new_rows.append({
                "filename": fname,
                "cik": doc.get("cik"),
                "year": doc.get("year"),
                value_col: val,
                evidence_col: evd,
            })

    # 3. Save Results
    if new_rows:
        df_new = pd.DataFrame(new_rows)
        # Merge
        df_master = pd.concat([df_master, df_new], ignore_index=True)

    print(
        f"\nDiscovery Complete. Updated {updates} rows. Added {len(new_rows)} new rows."
    )

    filename = output_file
    if update:
        # Insert the suffix before the real extension only. A plain
        # str.replace(".csv", ...) would also rewrite ".csv" occurring earlier
        # in the path and would silently overwrite non-.csv inputs.
        root, ext = os.path.splitext(output_file)
        filename = f"{root}_updated{ext}"
    df_master.to_csv(filename, index=False)
    return df_master.head()

In [None]:
# Example: Discover 'fiscal_year_end'
# Ensure you add 'fiscal_year_end' to QUESTION_BANK above first!

# UNCOMMENT TO RUN:
# df_disc = discover_new_feature("fiscal_year_end", limit=5)
# display(df_disc)