# Archived Questions and Extraction Prompts 

This notebook contains the archived questions and extraction prompts that were initially used to extract ground-truth data from the EDGAR corpus.

In [None]:
# Archived Questions
#
# First-generation question bank, keyed by section id ("section_1", ...).
# Each entry is a dict with:
#   id            - identifier of the question
#   prompt        - the question text
#   extract_regex - pattern with a single capture group (group 1), presumably
#                   the answer span -- confirm against the consuming code
#   keywords      - "hybrid locator" terms for finding the relevant paragraph
#
# Regex convention: patterns start with the global (?i) flag so that the
# *keywords* match in any case, but captures of proper nouns (state, month,
# surname) are wrapped in (?-i:...) so they must start with a capital letter.
# Without that scoping, "[A-Z][a-z]+" would match any word under (?i) and the
# capture would swallow trailing filler such as "in" or "located at".
# All-caps words ("DELAWARE") and two-letter abbreviations ("CA") still match.
#
# NOTE: later cells in this notebook assign QUESTION_BANK again, so running
# every cell leaves only the last definition bound.

QUESTION_BANK = {
    "section_1": [
        {
            "id": "incorporation_state",
            "prompt": "In which U.S. state was this company incorporated? Answer with ONLY the state name.",
            # Regex: Matches "Incorporated in the State of Delaware" -> "Delaware"
            "extract_regex": r"(?i)(?:incorporated|organized)(?: (?:under the laws of|in))? (?:the state of\s*)?((?-i:[A-Z][A-Za-z]+(?: [A-Z][A-Za-z]+)*))",
            # Hybrid Locator: Keywords to find the paragraph
            "keywords": ["incorporated", "organized under", "laws of the state", "formed under"],
        },
        {
            "id": "incorporation_year",
            "prompt": "In what year was this company incorporated? Answer with ONLY the year.",
            # Regex: Matches "incorporated ... in 1985" -> "1985"
            "extract_regex": r"(?i)(?:incorporated|founded|organized).*?in (19\d{2}|20\d{2})",
            "keywords": ["incorporated", "founded", "organized", "formed", "year"],
        },
        {
            "id": "employee_count",
            "prompt": "How many full-time employees does the company have? Answer with ONLY the number.",
            # Regex: Matches "approximately 5,000 employees" -> "5,000"
            # "employ(?:s|ed)?" so that "employs 5,000 ..." / "employed 5,000 ..."
            # are accepted; a bare "employ" is never directly followed by a space
            # and a number in running text.
            "extract_regex": r"(?i)(?:approximately|approx\.|had|total of|employ(?:s|ed)?)\s+([0-9,]+)(?:\s+full-time)?\s+employees",
            "keywords": ["employees", "full-time", "employed", "workforce", "persons"],
        },
        {
            "id": "fiscal_year_end",
            "prompt": "On what date does the company's fiscal year end? Answer with Month and Day (e.g., 'December 31').",
            # Regex: Matches "fiscal year ends December 31" -> "December 31"
            # The month must be capitalised so "fiscal year ended in 2005" is not
            # captured as "in 20".
            "extract_regex": r"(?i)fiscal year end(?:ed|s)(?:\s+on)?\s+((?-i:[A-Z][A-Za-z]+) \d{1,2})",
            "keywords": ["fiscal year end", "fiscal year ends", "fiscal year ended"],
        },
        {
            "id": "company_product",
            "prompt": "What is the main product, service, or business activity of this company? Answer in 2-5 words.",
            # Regex: Matches "engaged in the business of..." up to the next '.' or ';'
            "extract_regex": r"(?i)engaged in the (?:business|manufacture|sale|development) of ([^.;]+)",
            "keywords": ["engaged in", "business of", "manufacture", "sale of", "products"],
        },
    ],
    "section_2": [
        {
            "id": "headquarters_state",
            "prompt": "In which U.S. state are the company's principal executive offices located? Answer with ONLY the state name.",
            # Regex: Matches "executive offices ... [State] [Zip]"
            # The state must be capitalised so ", located at 10000 ..." (a 5-digit
            # street number) is not mistaken for "<State> <Zip>".
            "extract_regex": r"(?i)executive offices.*?,\s+((?-i:[A-Z][A-Za-z]+(?: [A-Z][A-Za-z]+)*))\s+\d{5}",
            "keywords": ["executive offices", "headquarters", "principal offices"],
        },
    ],
    "section_10": [
        {
            "id": "ceo_lastname",
            "prompt": "What is the Last Name of the current CEO? Answer with ONLY the last name.",
            # Regex: Matches "Mr. Smith ... CEO" -> "Smith"
            # Caveat: returns the first titled name on a line that later mentions
            # the CEO title, which is not necessarily the CEO.
            "extract_regex": r"(?i)(?:Mr\.|Ms\.|Mrs\.|Dr\.)\s+((?-i:[A-Z][A-Za-z]+)).*?(?:Chief Executive Officer|CEO)",
            "keywords": ["chief executive officer", "ceo", "serves as"],
        },
        {
            "id": "ceo_fullname_backup",
            "prompt": "What is the Full Name of the current CEO?",
            # Regex: Matches Name followed by CEO title (case-sensitive on purpose;
            # accepts "First Last" or "First M. Last")
            "extract_regex": r"(?m)([A-Z][a-z]+ [A-Z]\.? [A-Z][a-z]+|[A-Z][a-z]+ [A-Z][a-z]+)\s*,?\s*(?:Chief Executive Officer|CEO)",
            "keywords": ["chief executive officer", "ceo"],
        },
    ],
}

In [None]:
# More Archived Questions
#
# Prompt-only question bank: a flat list (not keyed by section like the
# dict-shaped banks in the other cells). This assignment rebinds QUESTION_BANK.
# Each entry is a dict with:
#   id       - identifier of the question
#   prompt   - one string, built from adjacent string literals; the numbered
#              rules inside it are part of the prompt text itself
#   sections - list of section ids ("section_1", "section_5", ...) associated
#              with the question; presumably where the answer is searched for,
#              in the listed order -- confirm against the consuming code
QUESTION_BANK = [
    # Legal name of the registrant (no DBA/brand names, no ticker).
    {
        "id": "registrant_name",
        "prompt": (
            "What is the exact legal name of the registrant? "
            "1. Look for the very first sentence of the 'Business' section or the cover page intro "
            "(e.g., 'Apple Inc. (the Registrant)...'). "
            "2. Do NOT use 'Doing Business As' (DBA) names or brand names. "
            "3. Do NOT include the stock ticker symbol. "
            "4. Include legal suffixes like 'Inc.', 'Corp.', 'Ltd.' if present. "
            "Answer with ONLY the legal name string."
        ),
        "sections": ["section_1", "section_2"],
    },
    # City of the principal executive offices (not the registered agent's city).
    {
        "id": "headquarters_city",
        "prompt": (
            "In which city are the registrant's *principal executive offices* physically located? "
            "1. Look for the address under 'Executive Offices' or 'Address of Principal Executive Offices'. "
            "2. CRITICAL WARNING: Do NOT return the city of the 'Registered Agent' or 'State of Incorporation' "
            "(e.g., ignore 'Wilmington' or 'Dover' unless the CEO actually works there). "
            "3. Ignore P.O. Boxes. "
            "Answer with ONLY the city name."
        ),
        "sections": ["section_1", "section_2"],
    },
    # Original state of incorporation: reincorporation -> old state,
    # new successor entity formed by merger -> successor's state.
    {
        "id": "original_incorporation_state",
        "prompt": (
            "In which U.S. state was the registrant *originally* incorporated or organized? "
            "Follow this strict hierarchy: "
            "1. PRIORITIZE HISTORY: Look for phrases like 'originally incorporated in', 'formerly organized in', "
            "or 'predecessor company incorporated in'. "
            "2. REINCORPORATION RULE: If the company reincorporated (e.g., moved from California to Delaware), "
            "you MUST return the OLD state (California), not the current one. "
            "3. MERGER EXCEPTION: Only if the registrant is a *new* successor entity formed by a merger, "
            "return the state of that successor. "
            "4. If no history is mentioned, return the current state. "
            "Answer with ONLY the state name."
        ),
        "sections": ["section_1"],
    },
    # Original year of incorporation: earliest incorporation year, except for a
    # merger of equals, where the merger year is used. 'Founded' dates are ignored.
    {
        "id": "original_incorporation_year",
        "prompt": (
            "In which year was the registrant *originally* incorporated or organized? "
            "1. IGNORE 'FOUNDED' dates. Only look for 'incorporated', 'organized', or 'formed'. "
            "2. REINCORPORATION RULE: If the text says 'originally incorporated in 1980' and 'reincorporated in 1995', "
            "return the EARLIEST year (1980). "
            "3. MERGER EXCEPTION: If the current entity was formed by a merger of equals, use the year of that merger. "
            "Answer with ONLY the year (YYYY)."
        ),
        "sections": ["section_1"],
    },
    # Employee head count as a bare integer; full-time count preferred.
    {
        "id": "employee_count",
        "prompt": (
            "What is the total number of employees the registrant has? "
            "1. PREFER FULL-TIME: If the text distinguishes between full-time and part-time, return the full-time count. "
            "2. If only 'total' is given, use that. "
            "3. EXCLUDE: Do not count independent contractors, agents, or temporary staff unless they are the only number given. "
            "4. FORMAT: Remove commas and return ONLY the integer (e.g., return 14500, not 14,500). "
            "If the number is 'approximately 5,000', return 5000."
        ),
        "sections": ["section_1"],
    },
    # State of the principal executive offices (may differ from the
    # state of incorporation).
    {
        "id": "headquarters_state",
        "prompt": (
            "In which U.S. state are the registrant's *principal executive offices* physically located? "
            "1. This is the state where the HQ building is, NOT necessarily the state of incorporation. "
            "2. CRITICAL: If the text says 'Incorporated in Delaware' but 'Executive offices in California', "
            "return CALIFORNIA. "
            "Answer with ONLY the state name."
        ),
        "sections": ["section_1", "section_2"],
    },
    # Last name of the current CEO (first-listed one when there are Co-CEOs).
    {
        "id": "ceo_lastname",
        "prompt": (
            "What is the LAST NAME of the registrant's current Chief Executive Officer (CEO)? "
            "1. Look for 'Chief Executive Officer', 'CEO', or 'Principal Executive Officer'. "
            "2. If 'Co-CEOs' are listed, pick the first one mentioned. "
            "3. EXCLUDE titles (Mr., Dr.) and first/middle names. "
            "4. If the CEO has a compound last name (e.g., 'Von Trap'), include the full last name. "
            "Answer with ONLY the last name string."
        ),
        "sections": ["section_1", "section_10"],
    },
    # Number of holders of record of common stock as a bare integer
    # (1 for a wholly-owned registrant; Class A + Class B holders are summed).
    { 
        "id":"holder_record_amount",
        "prompt": (
            "What is the number of **holders of record** of the registrant's common stock? "
            "1. KEYWORDS: Look for 'holders of record', 'shareholders of record', or 'record holders' in Item 5. "
            "2. WHOLLY-OWNED RULE: If the text states the stock is 'wholly-owned', 'held solely by', or 'all outstanding stock is held by' a parent company, return **1**. "
            "3. EXCLUDE BENEFICIAL OWNERS: Do not use counts of 'beneficial owners' or shares held in 'street name' unless strictly no other number exists. "
            "4. MULTIPLE CLASSES: If Class A and Class B Common Stock are listed, SUM the record holders. Ignore Preferred Stock. "
            "5. DATE PRIORITY: If multiple dates are provided (e.g., 'as of year-end' vs 'as of March 31'), choose the **most recent** date. "
            "Return ONLY the integer (e.g., 4530). Remove commas and words like 'approximately'."
        ),
        "sections": [
            "section_5",
            "section_1",
        ],  # usually in section 5, but can be in section 1
    },
]

In [None]:

# Hybrid question bank, keyed by section id ("section_1", "section_10").
# Each entry is a dict with:
#   id                - identifier of the question
#   prompt            - one string, built from adjacent string literals
#   regex_anchors     - list of patterns; judging by the inline comments they
#                       locate the relevant passage (confirm against the
#                       consuming code whether group 1 is also used as a value)
#   fallback_keywords - plain keywords, presumably used when no anchor matches
#
# Regex convention: patterns that start with the global (?i) flag match their
# *keywords* in any case, but captures of proper nouns (city, state, person
# name) and the two-letter state code are wrapped in (?-i:...) so they must
# start with a capital letter. Without that scoping "[A-Z][a-z]+" matches any
# word under (?i), e.g. "He is the CEO" would capture "He is".
# All-caps words ("DELAWARE", "JOHN SMITH") still match.
#
# NOTE: this assignment rebinds QUESTION_BANK; earlier cells in this notebook
# bind the same name to differently shaped banks.
QUESTION_BANK = {
    "section_1": [
        {
            "id": "company_name",
            # PROMPT: Targets the specific filing entity
            "prompt": (
                "What is the exact legal name of the registrant? "
                "1. Look for the very first sentence of the 'Business' section or the cover page intro "
                "(e.g., 'Apple Inc. (the Registrant)...'). "
                "2. Do NOT use 'Doing Business As' (DBA) names or brand names. "
                "3. Do NOT include the stock ticker symbol. "
                "4. Include legal suffixes like 'Inc.', 'Corp.', 'Ltd.' if present. "
                "Answer with ONLY the legal name string."
            ),
            # REGEX: Finds the intro paragraph (case-sensitive: no (?i) here)
            "regex_anchors": [
                r"([A-Z0-9][\w\s.,&'-]+?)\s*\(?(?:the\s+)?(?:Company|Registrant)\b",
                r"([A-Z0-9][\w\s.,&'-]+?),\s+a\s+\w+\s+corporation",
            ],
            "fallback_keywords": [
                "incorporated",
                "organized",
                "the company",
                "registrant",
            ],
        },
        {
            "id": "headquarters_city",
            "prompt": (
                "In which city are the registrant’s *principal executive offices* physically located? "
                "1. Look for the address under 'Executive Offices' or 'Address of Principal Executive Offices'. "
                "2. CRITICAL WARNING: Do NOT return the city of the 'Registered Agent' or 'State of Incorporation' "
                "(e.g., ignore 'Wilmington' or 'Dover' unless the CEO actually works there). "
                "3. Ignore P.O. Boxes. "
                "Answer with ONLY the city name."
            ),
            # REGEX: "<City>, <ST> <Zip>"; city words and the two-letter state
            # code must be upper-case-initial / upper-case respectively.
            "regex_anchors": [
                r"(?i)executive\s+offices.*?(?:located|address).*?[\r\n]+.*?,?\s*((?-i:[A-Z][A-Za-z]+(?: [A-Z][A-Za-z]+)*)),?\s+(?-i:[A-Z]{2})\s+\d{5}",
                r"(?i)located\s+at\s+.*?,?\s*((?-i:[A-Z][A-Za-z]+(?: [A-Z][A-Za-z]+)*)),?\s+(?-i:[A-Z]{2})\s+\d{5}",
            ],
            "fallback_keywords": ["executive offices", "located at", "address"],
        },
        # --- STATE: Hybrid Logic (Successor=New, Reinc=Old) ---
        {
            "id": "original_incorporation_state",
            # PROMPT: Explicitly handles the Monsanto vs Hexcel conflict
            "prompt": (
                "In which U.S. state was the registrant *originally* incorporated or organized? "
                "Follow this strict hierarchy: "
                "1. PRIORITIZE HISTORY: Look for phrases like 'originally incorporated in', 'formerly organized in', "
                "or 'predecessor company incorporated in'. "
                "2. REINCORPORATION RULE: If the company reincorporated (e.g., moved from California to Delaware), "
                "you MUST return the OLD state (California), not the current one. "
                "3. MERGER EXCEPTION: Only if the registrant is a *new* successor entity formed by a merger, "
                "return the state of that successor. "
                "4. If no history is mentioned, return the current state. "
                "Answer with ONLY the state name."
            ),
            # REGEX: Casts a wide net to find any mention of inc, org, or predecessors.
            # The first four anchors deliberately stay loose (\w+); only the two
            # that were written with explicit capitals enforce them.
            "regex_anchors": [
                r"(?i)incorporated (?:in|under the laws of) (?:the state of )?(\w+(?:\s+\w+)?)",
                r"(?i)organized (?:in|under the laws of) (?:the state of )?(\w+(?:\s+\w+)?)",
                r"(?i)a (\w+(?:\s+\w+)?) corporation",
                r"(?i)state of incorporation[:\s]+(\w+(?:\s+\w+)?)",
                r"(?i)originally\s+(?:incorporated|organized).*?in\s+((?-i:[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)?))",
                r"(?i)predecessor.*?incorporated.*?in\s+((?-i:[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)?))",
            ],
            "fallback_keywords": [
                "incorporated",
                "organized",
                "originally",
                "predecessor",
                "laws of the state",
            ],
        },
        # --- YEAR: Hybrid Logic (Successor=New, Reinc=Old) ---
        {
            "id": "original_incorporation_year",
            # PROMPT: Explicitly handles the Monsanto vs Hexcel conflict
            "prompt": (
                "In which year was the registrant *originally* incorporated or organized? "
                "1. IGNORE 'FOUNDED' dates. Only look for 'incorporated', 'organized', or 'formed'. "
                "2. REINCORPORATION RULE: If the text says 'originally incorporated in 1980' and 'reincorporated in 1995', "
                "return the EARLIEST year (1980). "
                "3. MERGER EXCEPTION: If the current entity was formed by a merger of equals, use the year of that merger. "
                "Answer with ONLY the year (YYYY)."
            ),
            "regex_anchors": [
                r"(?i)(?:incorporated|organized|founded|established|formed) (?:in |on |)(?:\w+ )?(18\d{2}|19\d{2}|20\d{2})",
                r"(?i)originally\s+incorporated.*?(18\d{2}|19\d{2}|20\d{2})",
            ],
            "fallback_keywords": [
                "founded",
                "incorporated",
                "organized",
                "year",
                "originally",
            ],
        },
        # --- EMPLOYEES: Exclusion Logic ---
        {
            "id": "employee_count",
            "prompt": (
                "What is the total number of employees the registrant has? "
                "1. PREFER FULL-TIME: If the text distinguishes between full-time and part-time, return the full-time count. "
                "2. If only 'total' is given, use that. "
                "3. EXCLUDE: Do not count independent contractors, agents, or temporary staff unless they are the only number given. "
                "4. FORMAT: Remove commas and return ONLY the integer (e.g., return 14500, not 14,500). "
                "If the number is 'approximately 5,000', return 5000."
            ),
            # The optional qualifier carries its own trailing whitespace. Writing
            # "\s+(?:full-time)?\s+" instead demands two whitespace runs when the
            # qualifier is absent, so plain "5,000 employees" would never match.
            "regex_anchors": [
                r"(?i)(?:had|employ).*?([0-9,]+)\s+(?:full-time\s+)?employees",
                r"(?i)(?:had|employ(?:ed|s)?|totaling)\s+(?:(?:approximately|over|roughly|about)\s+)?([0-9,]+)\s+(?:(?:full-time|total)\s+)?employees",
                r"(?i)([0-9,]+)\s+(?:full-time\s+)?(?:people|persons|employees)\s+(?:were|are)\s+employed",
            ],
            "fallback_keywords": ["employees", "full-time"],
        },
        {
            "id": "headquarters_state",
            "prompt": (
                "In which U.S. state are the registrant’s *principal executive offices* physically located? "
                "1. This is the state where the HQ building is, NOT necessarily the state of incorporation. "
                "2. CRITICAL: If the text says 'Incorporated in Delaware' but 'Executive offices in California', "
                "return CALIFORNIA. "
                "Answer with ONLY the state name."
            ),
            "regex_anchors": [
                r"(?i)(?:headquarters|principal (?:executive )?offices?|corporate offices?) (?:is |are |)(?:located |)in ([^,\.\n]+)",
                # "<State> <Zip>" after a comma; the state must be capitalised so a
                # 5-digit street number after lowercase words is not taken as a zip.
                r"(?i)executive offices.*?,\s+((?-i:[A-Z][A-Za-z]+(?: [A-Z][A-Za-z]+)*))\s+\d{5}",
            ],
            "fallback_keywords": [
                "executive offices",
                "headquarters",
                "principal offices",
            ],
        },
    ],
    "section_10": [
        {
            "id": "ceo_lastname",
            "prompt": (
                "What is the LAST NAME of the registrant’s current Chief Executive Officer (CEO)? "
                "1. Look for 'Chief Executive Officer', 'CEO', or 'Principal Executive Officer'. "
                "2. If 'Co-CEOs' are listed, pick the first one mentioned. "
                "3. EXCLUDE titles (Mr., Dr.) and first/middle names. "
                "4. If the CEO has a compound last name (e.g., 'Von Trap'), include the full last name. "
                "Answer with ONLY the last name string."
            ),
            # REGEX: "<First Last>, [is|serves as] [the|our] CEO" and "CEO: <First Last>".
            # Both name words must be capitalised.
            "regex_anchors": [
                r"(?i)((?-i:[A-Z][A-Za-z]+ [A-Z][A-Za-z]+))[,\s]+(?:is |serves as |)(?:the |our |)(?:Chief Executive Officer|CEO)",
                r"(?i)(?:Chief Executive Officer|CEO)[:\s]+((?-i:[A-Z][A-Za-z]+ [A-Z][A-Za-z]+))",
            ],
            "fallback_keywords": ["chief executive officer", "ceo", "serves as"],
        }
    ],
}