# Edgar Testing No Context Recall Experiment In Different Years 

**Goal:** To evaluate the performance of large language models in recalling ground truth (extractive QA) given no context beyond the registrant name and filing year, across different years in the EDGAR dataset.  

Should be able to: 
- Compare performance across different years (Being able to pick specific years to compare) 
- Compare performance between different models (Qwen, Llama, etc) 
- Compare performance between different questions (state of incorporation, year of incorporation, year, etc) 
- Compare performance with no context vs with context 
- Visualize results (simple bar charts)


In [None]:
# Install Dependencies
#%env HF_TOKEN=your_token
#!pip install -q datasets torch pandas tqdm thefuzz python-Levenshtein transformers huggingface_hub accelerate bitsandbytes 

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset 
from transformers import AutoModelForCausalLM, AutoTokenizer, logging, BitsAndBytesConfig
from huggingface_hub import login 

# Only surface errors from the transformers logger.
logging.set_verbosity_error()

# Never truncate cell contents when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

# Choose the torch device string: GPU when CUDA is available, CPU otherwise.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Authenticate against the Hugging Face Hub with the HF_TOKEN environment
# variable (raises KeyError if it is not set).
# Alternative: login(token="your_token")
login(token=os.environ["HF_TOKEN"])

print(f"Running on: {DEVICE}")
print("Setup Complete.")

In [None]:
# Experiment configuration.
# Edit the entries below to retarget the experiment (model, mode, year, data).
CONFIG = dict(
    # --- Resources ---
    # Hugging Face model id. Candidates: Qwen/Qwen2.5-7B-Instruct,
    # meta-llama/Meta-Llama-3-8B-Instruct, meta-llama/Llama-3.1-8B-Instruct,
    # or some other Hugging Face model.
    model_id="Qwen/Qwen2.5-7B-Instruct",
    # CSV file the per-document results are written to.
    output_file="experiment_results_qwen_nocontext_1993.csv",

    # --- Experiment mode ---
    # 'NO_CONTEXT': ask about company/year facts using only the model's weights.
    # 'CONTEXT': provide the full 10-K text (extractive QA).
    mode="NO_CONTEXT",

    # --- Ground truth data ---
    # Path to a CSV containing the ground truth labels.
    local_data_path="../data/ground_truth/v2_250_(1993)_(1-18-2025).csv",

    # --- Filters ---
    # Years to keep from the ground truth CSV.
    target_years=[1993],
    # Maximum number of documents to evaluate; set None for all.
    limit_samples=250,

    # --- Generation ---
    # Passed straight through to model.generate().
    generation_params=dict(
        max_new_tokens=100,
        do_sample=False,  # deterministic decoding
        temperature=None,
    ),
)

print(f"Configured for Mode: {CONFIG['mode']} on Years: {CONFIG['target_years']}")

In [None]:
# Load the data
def build_experiment_dataset(config):
    """
    Yield ``(doc, gt_row)`` pairs for ground-truth filings found in the corpus.

    ``doc`` is a record streamed from the Hugging Face EDGAR corpus (it carries
    the filing text) and ``gt_row`` is the matching ground-truth CSV row as a
    dict of column -> value (it carries the verified labels). The two are joined
    on the ``filename`` column.

    Args:
        config: Experiment configuration. Reads ``local_data_path`` (CSV path),
            ``target_years`` (years to keep; a falsy value disables the filter)
            and, optionally, ``limit_samples`` (maximum number of pairs to
            yield; ``None``/0 means no limit).

    Yields:
        Tuples ``(doc, gt_row)``, at most one per ground-truth filename.

    Raises:
        FileNotFoundError: If the ground-truth CSV does not exist. Because this
            is a generator function, the error surfaces on first iteration.
    """
    # 1. Load the ground truth CSV into memory (labels + filenames).
    gt_path = config["local_data_path"]
    if not os.path.exists(gt_path):
        raise FileNotFoundError(f"GT file not found: {gt_path}")

    df_gt = pd.read_csv(gt_path)

    # Restrict to the requested filing years, if any were given.
    if config["target_years"]:
        df_gt = df_gt[df_gt["year"].isin(config["target_years"])]

    # Lookup map: filename -> ground-truth row.
    gt_lookup = df_gt.set_index("filename").to_dict("index")
    # Filenames that still have to be located in the streamed corpus.
    remaining = set(gt_lookup)

    print(f"Loaded GT CSV. Targets: {len(remaining)} documents.")

    # Nothing to match: do not start streaming the corpus at all.
    if not remaining:
        return

    # 2. Stream the HF dataset and pick out the documents we have labels for.
    dataset = load_dataset(
        "c3po-ai/edgar-corpus",
        "default",
        split="train",
        streaming=True,
        revision="refs/convert/parquet",
    )

    matched_count = 0
    limit = config.get("limit_samples")

    for doc in dataset:
        fname = doc.get("filename")
        if fname not in remaining:
            continue

        # Each target is yielded once, even if the corpus repeats a filename.
        remaining.discard(fname)
        yield doc, gt_lookup[fname]

        matched_count += 1
        # Stop at the sample limit, or as soon as every target has been found,
        # so the rest of the corpus is not streamed for nothing.
        if (limit and matched_count >= limit) or not remaining:
            break

In [None]:
def setup_model(config):
    """
    Load the 4-bit quantized causal LM and its tokenizer.

    Args:
        config: Experiment configuration; only ``config["model_id"]`` (a
            Hugging Face model id) is read.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model_id = config["model_id"]

    print(f"Loading model: {model_id}...")

    # NF4 4-bit weights with fp16 compute. The keyword is
    # ``bnb_4bit_compute_dtype``; the previous ``bnb_4bit_compute_type`` is not
    # a BitsAndBytesConfig parameter, so the fp16 setting was never applied.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

    # NOTE(review): trust_remote_code=True executes Python code shipped in the
    # model repository; only use it with model ids you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )

    print("Model loaded.")
    return model, tokenizer

In [None]:
# Question bank and prompt-generation constants.

# Keys of the 10-K item sections looked up on each corpus document, in order.
SECTION_KEYS = [
    f"section_{suffix}"
    for suffix in (
        "1", "1A", "1B", "2", "3",
        "4", "5", "6", "7", "7A",
        "8", "9", "9A", "9B", "10",
        "11", "12", "13", "14", "15",
    )
]

# Question id -> ground-truth CSV column that holds the expected answer.
GT_COLUMN_MAP = dict(
    headquarters_city="headquarters_city_truth",
    headquarters_state="headquarters_state_truth",
    incorporation_state="original_Inc_state_truth",
    incorporation_year="original_Inc_year_truth",
    employee_count="employee_count_truth",
    holder_record_amount="holder_record_amount_truth",
)

# Questions asked for every document, in this order; each "id" is a key of
# GT_COLUMN_MAP.
QUESTION_BANK = [
    {"id": question_id, "prompt": question_text}
    for question_id, question_text in (
        (
            "headquarters_state",
            "What U.S. state is the location of the registrant's principal executive offices?",
        ),
        (
            "incorporation_state",
            "What is the registrant's state of incorporation?",
        ),
        (
            "incorporation_year",
            "What is the registrant's year of incorporation?",
        ),
        (
            "employee_count",
            "What is the number of employees that the registrant has?",
        ),
        (
            "headquarters_city",
            "What city is explicitly stated as the location of the registrant's principal executive offices?",
        ),
        (
            "holder_record_amount",
            "What is the number of holders of record of the registrant's common stock?",
        ),
    )
]

def build_full_context(doc):
    """
    Concatenate all non-empty sections of a document into one string.

    Sections are taken in ``SECTION_KEYS`` order and each one is preceded by a
    ``--- [SECTION_X] ---`` header line. Sections that are missing, ``None``,
    or whitespace-only are skipped.

    Args:
        doc: Mapping from section key to section text.

    Returns:
        The joined text, or an empty string if no section has content.
    """
    parts = []
    for key in SECTION_KEYS:
        # ``or ""`` also covers keys that are present but hold None, which
        # would otherwise break the .strip() call.
        text = doc.get(key) or ""
        if text.strip():
            parts.append(f"\n\n--- [{key.upper()}] ---\n\n{text}")
    return "".join(parts)

def format_prompt(doc, gt_data, question_item, mode, tokenizer, max_context_chars=150000):
    """
    Build the chat-template prompt for one question about one filing.

    Args:
        doc: Corpus document (section key -> text). Only used outside
            ``"NO_CONTEXT"`` mode, where its sections become the context.
        gt_data: Ground-truth row for the filing. In ``"NO_CONTEXT"`` mode the
            ``registrant_name_truth`` and ``year`` entries identify the filing
            (falling back to ``"the company"`` / ``"N/A"`` when absent).
        question_item: Question dict; its ``"prompt"`` entry is the question.
        mode: ``"NO_CONTEXT"`` asks the model to answer from its own knowledge;
            any other value supplies the filing text as context.
        tokenizer: Object exposing ``apply_chat_template``.
        max_context_chars: Maximum number of characters of filing text to put
            in the prompt in context mode. Defaults to the previously
            hard-coded 150000; lower it for models with a shorter context
            window.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for the system/user
        message pair with ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    q_text = question_item["prompt"]

    if mode == "NO_CONTEXT":
        company = gt_data.get("registrant_name_truth", "the company")
        year = gt_data.get("year", "N/A")

        user_content = (
            f"Answer based on your internal knowledge.\n"
            f"Context: {year} SEC 10-K filing for '{company}'.\n"
            f"Question: {q_text}\n"
            f"Return ONLY the value. If unknown, return NOT_FOUND."
        )
        system_content = "You are a financial data assistant. Be concise."

    else:
        # Context mode: give the model the filing text, truncated by character
        # count so that very long filings do not produce an oversized prompt.
        full_text = build_full_context(doc)
        context_window = full_text[:max_context_chars]

        user_content = (
            f"Read this SEC 10-K filing and answer the question.\n\n"
            f"Question: {q_text}\n\n"
            f"Text:\n{context_window}\n\n"
            f"Return ONLY the extracted value."
        )
        system_content = "You are a precise extraction assistant."

    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content}
    ]

    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [None]:
# 6. Execution Engine
import matplotlib.pyplot as plt

def normalize_answer(text):
    """
    Normalize an answer for comparison.

    The value is converted to a lower-case string with commas and periods
    removed and surrounding whitespace stripped.

    Args:
        text: Prediction or ground-truth value. May be a string, a number, or
            ``None``.

    Returns:
        The normalized string; ``""`` for ``None``.
    """
    if text is None:
        return ""
    # Integer-valued floats (e.g. 1985.0 from a pandas column that contains
    # NaNs) must become "1985": stripping the "." from "1985.0" would otherwise
    # produce "19850". NaN and infinities are not integers and fall through.
    if isinstance(text, float) and text.is_integer():
        text = int(text)
    return str(text).lower().replace(",", "").replace(".", "").strip()


# Normalized truth values that mean "no ground truth available".
# "n/a" is what "N/A" (the default truth used by run_experiment) normalizes to.
_MISSING_TRUTHS = frozenset({"", "null", "nan", "n/a"})


def check_accuracy(prediction, truth):
    """
    Decide whether a prediction matches the ground truth.

    Both values are normalized with ``normalize_answer``. A prediction is
    correct when the truth appears inside it (e.g. "Delaware" inside
    "State of Delaware"). Purely numeric truths must match a whole number in
    the prediction, so a truth of 12 does not match a prediction of "1200".

    Args:
        prediction: Model output.
        truth: Ground-truth value.

    Returns:
        ``True`` if the prediction is considered correct. Always ``False`` when
        the prediction is empty or ``NOT_FOUND``, or when the truth is missing
        (empty, null, NaN or N/A).
    """
    p = normalize_answer(prediction)
    t = normalize_answer(truth)

    # An empty truth must be rejected explicitly: "" is a substring of every
    # string and would otherwise count every prediction as correct.
    if not p or p == "not_found" or t in _MISSING_TRUTHS:
        return False

    if t.isdigit():
        # Split the prediction into its runs of digits and require one of them
        # to equal the truth exactly.
        digit_runs = "".join(ch if ch.isdigit() else " " for ch in p).split()
        return t in digit_runs

    # Substring match; this also covers exact equality.
    return t in p


In [None]:
def plot_accuracy(stats, config):
    """
    Draw a bar chart of accuracy per question field.

    Args:
        stats: Mapping of field id -> {"correct": int, "total": int}.
        config: Experiment configuration; ``model_id``, ``mode`` and
            ``target_years`` are shown in the chart title.
    """
    field_names = list(stats.keys())

    # Percentage correct per field; fields with no samples are plotted as 0.
    accuracies = []
    for counts in stats.values():
        if counts["total"] > 0:
            accuracies.append(counts["correct"] / counts["total"] * 100)
        else:
            accuracies.append(0)

    plt.figure(figsize=(10, 6))
    bars = plt.bar(field_names, accuracies, color="#4c72b0")

    title = f"Accuracy by Field\n{config['model_id']} | {config['mode']} | Year: {config['target_years']}"
    plt.title(title)
    plt.ylabel("Accuracy (%)")
    plt.ylim(0, 100)
    plt.xticks(rotation=45, ha="right")

    # Write each bar's value just above it, centred horizontally on the bar.
    for bar in bars:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width()/2.
        plt.text(x_center, height + 1, f'{height:.1f}%', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

In [None]:
def run_experiment(config):
    """
    Run the full experiment: load the model, query it, score it, save results.

    For every ``(doc, gt_row)`` pair produced by ``build_experiment_dataset``,
    each question in ``QUESTION_BANK`` is turned into a prompt, answered by the
    model and compared with the ground truth. Results are written to
    ``config["output_file"]`` every 5 documents and once more at the end, and
    an accuracy chart is shown.

    Args:
        config: Experiment configuration. Reads ``mode``, ``output_file``,
            ``generation_params`` and (optionally) ``limit_samples`` here; it
            is also passed on to ``setup_model``, ``build_experiment_dataset``
            and ``plot_accuracy``.

    Returns:
        A DataFrame with one row per document holding ``filename``,
        ``company``, ``year`` and, per question id, the ``<id>_pred``,
        ``<id>_truth`` and ``<id>_correct`` columns.
    """
    model, tokenizer = setup_model(config)
    # Use the configuration passed in, not the module-level CONFIG, so the
    # dataset always matches the model and output file of this run.
    dataset = build_experiment_dataset(config)

    results = []
    stats = {q["id"]: {"correct": 0, "total": 0} for q in QUESTION_BANK}

    print(f"--- Running Experiment: {config['mode']} ---")

    for doc, gt_data in tqdm(dataset, total=config.get("limit_samples")):
        row = {
            "filename": doc["filename"],
            "company": gt_data.get("registrant_name_truth"),
            "year": gt_data.get("year"),
        }

        for q in QUESTION_BANK:
            question_id = q["id"]

            # 1. Generate
            prompt = format_prompt(doc, gt_data, q, config["mode"], tokenizer)
            inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    pad_token_id=tokenizer.eos_token_id,
                    **config["generation_params"]
                )

            # Decode only the newly generated tokens: slice off the prompt,
            # whose length is the input sequence length.
            prompt_length = inputs.input_ids.shape[1]
            pred = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

            # 2. Evaluate ("N/A" when the row has no value for this question)
            truth = gt_data.get(GT_COLUMN_MAP.get(question_id), "N/A")
            is_correct = check_accuracy(pred, truth)

            # 3. Stats & logging
            stats[question_id]["total"] += 1
            if is_correct:
                stats[question_id]["correct"] += 1

            row[f"{question_id}_pred"] = pred
            row[f"{question_id}_truth"] = truth
            row[f"{question_id}_correct"] = is_correct

        results.append(row)

        # Periodic save so an interrupted run keeps its partial results.
        if len(results) % 5 == 0:
            pd.DataFrame(results).to_csv(config["output_file"], index=False)

    # Final output
    df_res = pd.DataFrame(results)
    df_res.to_csv(config["output_file"], index=False)
    print(f"Results saved to {config['output_file']}")

    plot_accuracy(stats, config)
    return df_res


# Run the experiment with the current CONFIG. The returned DataFrame has one
# row per document, with <question_id>_pred / _truth / _correct columns.
df_results = run_experiment(CONFIG)

### Feel free to explore different df_results by changing CONFIG and re-running the experiment! 

```python 

CONFIG = {
    # Resources
    "model_id": "Qwen/Qwen2.5-32B-Instruct",
    "output_file": "experiment_results_qwen_nocontext_2022.csv",
    
    # Experiment Mode
    # 'NO_CONTEXT': Ask model about specific company/year facts using only its internal weights.
    # 'CONTEXT': Provide full 10-K text (Extractive QA).
    "mode": "NO_CONTEXT", 
    
    # Ground Truth Data
    # Path to a CSV containing ground truth data.
    "local_data_path": "../data/ground_truth/v1_250_1-6-2025.csv", 
    
    # Filters
    "target_years": [1993], 
    "limit_samples": 250, # Set None for all
    
    # Generation
    "generation_params": {
        "max_new_tokens": 100,
        "do_sample": False, # Deterministic
        "temperature": None
    }
}
```

In [None]:
# Re-run the same experiment with Llama 3 8B Instruct, writing to its own CSV.
CONFIG.update(
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",
    output_file="experiment_results_llama3_nocontext_1993.csv",
)
df_results = run_experiment(CONFIG)