In [1]:
# Config
# ============================================================================
# Model Performance Notes:
# gemini-2.0-flash -> 1 minute, 122 extractions -> 13 COMMITTED | 108 CANDIDATE_ONLY | 1 REVIEW
# gemini-2.5-flash -> 7:22 minutes, 528 extractions -> 252 COMMITTED | 267 CANDIDATE_ONLY | 9 REVIEW
# ============================================================================

# Model id passed to lx.extract(model_id=...) in the extraction cell.
model_name = "gemini-2.0-flash"  # gemini-2.5-flash, gemini-2.5-pro, gemini-2.0-flash
# Forwarded to lx.extract as resolver_params["suppress_parse_errors"].
# NOTE(review): with False, the recorded run ended in a ResolverParsingError — confirm
# whether parse errors should really abort the whole extraction.
suppress_parse_errors_value = False

# 8-K Configuration
# ACCESSION_NO is always required: the Neo4j fetch cell uses it to look up the report
# metadata (CIK, ticker, created datetime) that the catalog fetch depends on.
# SAMPLE_FILE_PATH is only a fallback for the 8-K *text*: it is read when the fetched
# report carries no exhibit_contents.
ACCESSION_NO = "0001571996-25-000096"  # Set this to fetch from Neo4j (preferred for live)
SAMPLE_FILE_PATH = None  # Or set this for manual file loading

# Catalog Configuration
# Passed to get_xbrl_catalog(include_amendments=...).
INCLUDE_AMENDMENTS = False  # Include 10-K/A, 10-Q/A amendments

In [2]:
# Setup: Add paths and imports
import sys
import os

# Put the current working directory first on sys.path so the project-local modules
# imported below resolve from the directory the notebook is run in.
sys.path.insert(0, os.getcwd())

# Core imports (project-local modules; presumably located in the working directory)
from xbrl_catalog import get_xbrl_catalog, print_catalog_summary, get_neo4j_driver
from postprocessor import postprocess
from extraction_schema import ExtractionStatus, RawExtraction, UNMATCHED

# Echo the directory so a wrong launch location is visible right away.
print(f"Working directory: {os.getcwd()}")

Working directory: /home/faisal/EventMarketDB/drivers/8K_XBRL_Linking/FinalScripts


## Status Determination

Determines extraction status based on validation results:
- **COMMITTED**: confidence ≥ 0.90 + valid qname + valid unit + valid period + value parsed
- **CANDIDATE_ONLY**: Valid but low confidence or UNMATCHED concept  
- **REVIEW**: Parse failure, invalid period, or invalid unit

---
## Catalog Fetch (Neo4j)

Fetches the XBRL catalog from Neo4j (read-only; nothing is written back). The catalog contains concepts, units, dimensions, and historical facts, which are later passed to the LLM as context.

In [3]:
# Fetch 8-K metadata and XBRL catalog from Neo4j
# ============================================================================
# Steps performed by this cell:
#   1. Look up the 8-K Report node and its primary filer (CIK, ticker, name).
#   2. Take the report's `created` datetime as the as_of_dt cut-off.
#   3. Build the 8-K text from exhibit_contents (plain string or dict of exhibits).
#   4. Fetch the XBRL catalog for that CIK as of that datetime.
# ============================================================================

driver = get_neo4j_driver()

with driver.session() as session:
    # One row is expected: the report joined to its primary filer.
    result = session.run("""
        MATCH (r:Report {accessionNo: $accession_no})
        MATCH (r)-[:PRIMARY_FILER]->(c:Company)
        RETURN r.accessionNo as accession_no,
               r.formType as form_type,
               r.created as created,
               r.periodOfReport as period_of_report,
               r.exhibit_contents as exhibit_contents,
               c.cik as cik,
               c.ticker as ticker,
               c.name as company_name
    """, accession_no=ACCESSION_NO)

    report = result.single()

    if not report:
        raise ValueError(f"Report {ACCESSION_NO} not found in Neo4j")

# Pull out the fields the rest of the notebook relies on.
# report_created doubles as the as_of_dt for the catalog fetch below.
report_cik, report_ticker, report_created, report_exhibits = (
    report[field] for field in ("cik", "ticker", "created", "exhibit_contents")
)

print(
    f"8-K Report: {ACCESSION_NO}",
    f"Company: {report['company_name']} ({report_ticker})",
    f"CIK: {report_cik}",
    f"Filed: {report_created}",
    f"Period: {report['period_of_report']}",
    sep="\n",
)

# exhibit_contents arrives either as a mapping of exhibit name -> content
# or as a single string holding the 8-K text; anything else means "no text".
if isinstance(report_exhibits, dict):
    sample_8k_text = "\n\n".join(
        f"=== {name} ===\n{content}"
        for name, content in report_exhibits.items()
    )
    print(f"8-K text: {len(sample_8k_text):,} characters from {len(report_exhibits)} exhibits")
elif isinstance(report_exhibits, str):
    sample_8k_text = report_exhibits
    print(f"8-K text: {len(sample_8k_text):,} characters")
else:
    sample_8k_text = None
    print("Warning: No exhibit_contents found. Use SAMPLE_FILE_PATH instead.")

# The catalog is fetched with as_of_dt set to the 8-K's created datetime,
# the intent being to see only XBRL filed before the 8-K itself.
print(f"\nFetching XBRL catalog with as_of_dt={report_created}...")

catalog = get_xbrl_catalog(
    driver=driver,
    cik=report_cik,
    as_of_dt=report_created,
    include_amendments=INCLUDE_AMENDMENTS,
    include_relationships=True,
)

print_catalog_summary(catalog)

8-K Report: 0001571996-25-000096
Company: DELL TECHNOLOGIES INC (DELL)
CIK: 0001571996
Filed: 2025-08-28T16:09:59-04:00
Period: 2025-08-28
8-K text: 29,249 characters

Fetching XBRL catalog with as_of_dt=2025-08-28T16:09:59-04:00...

XBRL Catalog: DELL TECHNOLOGIES INC (DELL)
CIK: 0001571996
Industry: ComputerHardware
Sector: Technology

Total Filings: 10
Total Facts: 16,517
Unique Concepts: 814

Filings:
  - 10-Q (2025-05-02): 1,160 facts
  - 10-K (2025-01-31): 2,185 facts
  - 10-Q (2024-11-01): 1,565 facts
  - 10-Q (2024-08-02): 1,541 facts
  - 10-Q (2024-05-03): 1,194 facts
  - 10-K (2024-02-02): 2,044 facts
  - 10-Q (2023-11-03): 1,728 facts
  - 10-Q (2023-08-04): 1,673 facts
  - 10-Q (2023-05-05): 1,316 facts
  - 10-K (2023-02-03): 2,111 facts

Top Segments:
  - FinanceLeasesPortfolioSegment: 811 facts
  - LoansAndFinanceReceivables: 707 facts
  - Nondesignated: 590 facts
  - UnsecuredDebt: 576 facts
  - ForeignExchangeContract: 528 facts




In [4]:
# Inspect catalog data
# Prints a 20-item sample of concept qnames followed by every unit in the catalog.
# The whole cell is a no-op when `catalog` has not been created yet.
if 'catalog' in dir():
    concept_qnames = list(catalog.concepts.keys())
    print(f"Valid Qnames ({len(catalog.concepts)} total, first 20):")
    for qname in concept_qnames[:20]:
        print(f"  {qname}")

    unit_names = catalog.units.keys()
    print(f"\nValid Units ({len(catalog.units)} total):")
    for unit in unit_names:
        print(f"  {unit}")

Valid Qnames (814 total, first 20):
  us-gaap:Revenues
  us-gaap:StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest
  us-gaap:NotesReceivableGross
  us-gaap:DebtInstrumentCarryingAmount
  us-gaap:DerivativeFairValueOfDerivativeLiability
  us-gaap:DerivativeFairValueOfDerivativeAsset
  us-gaap:CostOfRevenue
  us-gaap:ProfitLoss
  us-gaap:DerivativeAssetsLiabilitiesAtFairValueNet
  us-gaap:OperatingIncomeLoss
  us-gaap:DebtInstrumentInterestRateStatedPercentage
  us-gaap:NotesReceivableNet
  us-gaap:LongTermDebt
  us-gaap:CommonStockSharesIssued
  us-gaap:FinancingReceivableAllowanceForCreditLosses
  us-gaap:DividendsCommonStock
  us-gaap:FinancingReceivableRevolving
  us-gaap:AdjustmentsToAdditionalPaidInCapitalSharebasedCompensationRequisiteServicePeriodRecognitionValue
  us-gaap:SeveranceCosts1
  us-gaap:OperatingExpenses

Valid Units (12 total):
  iso4217:USD
  shares
  pure
  iso4217:USDshares
  dell:vote
  iso4217:EUR
  dell:facility
  dell:segment
  dell:tranch

In [5]:
# Preview and export catalog
# ============================================================================
# View LLM context (text format) and export the full catalog to HTML.
# The cell does nothing when `catalog` has not been created yet.
# ============================================================================

if 'catalog' in dir():
    # Generate LLM context - explicit parameters for easy tuning
    # -------------------------------------------------------------------------
    # max_concepts_with_history: None = ALL | Try 50-100 for smaller context
    # max_history_per_concept:   None = ALL | Try 4 for recent quarters only
    # max_sample_facts:          None = ALL | (currently disabled)
    # include_relationships:     True = include calc/presentation networks | False = smaller
    # context_budget_chars:      None = no limit | Try 100000 for ~100k char limit
    # -------------------------------------------------------------------------
    context = catalog.to_llm_context(
        max_concepts_with_history=None,   # None = ALL concepts with history
        max_history_per_concept=None,     # None = ALL history values per concept
        max_sample_facts=None,            # None = ALL (section currently disabled)
        include_relationships=True,       # True = include calc/presentation networks
        context_budget_chars=None         # None = no limit on output size
    )

    # Only the head of the context is echoed; the marker is printed solely when
    # something was actually cut off, so a short context is not mislabelled.
    preview_chars = 3000
    print(f"LLM Context ({len(context):,} chars):\n")
    print(context[:preview_chars])
    if len(context) > preview_chars:
        print("\n... [truncated] ...")

    # Export to HTML for full catalog review.
    # The output directory is created here because this cell runs before the
    # later cell that calls os.makedirs on the same directory.
    html_output = f"/home/faisal/EventMarketDB/drivers/8K_XBRL_Linking/output/{report_ticker}_catalog.html"
    os.makedirs(os.path.dirname(html_output), exist_ok=True)
    catalog.to_html(output_path=html_output)
    print(f"\nOpen in browser: file://{html_output}")

LLM Context (307,045 chars):

<<<BEGIN_XBRL_REFERENCE_DATA>>>
COMPANY: DELL TECHNOLOGIES INC (DELL)
CIK: 0001571996 | Industry: ComputerHardware | Sector: Technology

LEGEND:
• qname = unique concept identifier (e.g., us-gaap:Revenues)
• label = human-readable name

────────────────────────────────────────────────────────────────────────
FISCAL CALENDAR
────────────────────────────────────────────────────────────────────────
Fiscal Year End: January 31
Quarter Mapping (fiscal → calendar end month):
  FY2026: Q1→Apr25 | Q2→Jul25 | Q3→Oct25 | Q4→Jan26
  FY2025: Q1→Apr24 | Q2→Jul24 | Q3→Oct24 | Q4→Jan25

────────────────────────────────────────────────────────────────────────
CONCEPTS (433 numeric, top 433 shown)
History shows recent values for magnitude validation (10-K=annual, 10-Q=quarterly)
────────────────────────────────────────────────────────────────────────
── TOP CONCEPTS (with history for magnitude validation) ──
us-gaap:NotesReceivableGross | Financing Receivable, before Allow

In [6]:
# Alternative: Load 8-K from file (if not using Neo4j fetch above)
# ============================================================================
# File fallback for the 8-K text, followed by the budgeted catalog context build.
# The file is only read when SAMPLE_FILE_PATH is set AND the fetched report had
# no exhibit_contents.
# ============================================================================

if SAMPLE_FILE_PATH and not report_exhibits:
    print(f"Loading 8-K from file: {SAMPLE_FILE_PATH}")
    # Explicit encoding so the result does not depend on the platform locale.
    with open(SAMPLE_FILE_PATH, 'r', encoding='utf-8') as f:
        sample_8k_text = f.read()
    print(f"Loaded 8-K: {len(sample_8k_text):,} characters")

    # Note: When loading from file, as_of_dt was already set from Neo4j query above
    # If you're not using Neo4j at all, you'd need to set as_of_dt manually

# Fail early with a clear message: the fetch cell leaves sample_8k_text as None
# when the report has no exhibit_contents, and an empty document is not worth
# sending to the model either.
if not sample_8k_text:
    raise ValueError(
        "No 8-K text available: the report has no exhibit_contents and "
        "SAMPLE_FILE_PATH is not set (or the file is empty)."
    )

# Build catalog context for LLM - explicit parameters for easy tuning
# -------------------------------------------------------------------------
# max_concepts_with_history: None = ALL | Try 50-100 for smaller context
# max_history_per_concept:   None = ALL | Try 4 for recent quarters only
# max_sample_facts:          None = ALL | (currently disabled)
# include_relationships:     True = include calc/presentation networks | False = smaller
# context_budget_chars:      None = no limit | Try 100000 for ~100k char limit
# -------------------------------------------------------------------------
llm_context = catalog.to_llm_context(
    max_concepts_with_history=None,   # None = ALL concepts with history
    max_history_per_concept=None,     # None = ALL history values per concept
    max_sample_facts=None,            # None = ALL (section currently disabled)
    include_relationships=True,       # True = include calc/presentation networks
    context_budget_chars=100000       # cap the context at ~100k chars (None = no limit)
)
print(f"Catalog context: {len(llm_context):,} characters")
print(f"8-K text: {len(sample_8k_text):,} characters")
print(f"Total context: {len(llm_context) + len(sample_8k_text):,} characters")

Catalog context: 99,946 characters
8-K text: 29,249 characters
Total context: 129,195 characters


In [7]:
# Alternative: Load 8-K from file (if not using Neo4j fetch above)
# ============================================================================
# Use this if you have a local file instead of fetching from Neo4j
#
# NOTE(review): this cell repeats the previous cell's file fallback and then
# rebuilds llm_context WITHOUT context_budget_chars. Run in order, it replaces
# the ~100k-char budgeted context with the full one (307,045 chars in the
# recorded output), and that is what the extraction cell then receives.
# Confirm which of the two cells is meant to stay.
# ============================================================================

if SAMPLE_FILE_PATH and not report_exhibits:
    print(f"Loading 8-K from file: {SAMPLE_FILE_PATH}")
    # Explicit encoding so the result does not depend on the platform locale.
    with open(SAMPLE_FILE_PATH, 'r', encoding='utf-8') as f:
        sample_8k_text = f.read()
    print(f"Loaded 8-K: {len(sample_8k_text):,} characters")

    # Note: When loading from file, as_of_dt was already set from Neo4j query above
    # If you're not using Neo4j at all, you'd need to set as_of_dt manually

# Fail early with a clear message: the fetch cell leaves sample_8k_text as None
# when the report has no exhibit_contents, which would otherwise surface below
# as an opaque TypeError from len().
if not sample_8k_text:
    raise ValueError(
        "No 8-K text available: the report has no exhibit_contents and "
        "SAMPLE_FILE_PATH is not set (or the file is empty)."
    )

# Build catalog context for LLM (max_concepts_with_history=None shows ALL concepts)
llm_context = catalog.to_llm_context(max_concepts_with_history=None)
print(f"Catalog context: {len(llm_context):,} characters")
print(f"8-K text: {len(sample_8k_text):,} characters")
print(f"Total context: {len(llm_context) + len(sample_8k_text):,} characters")

Catalog context: 307,045 characters
8-K text: 29,249 characters
Total context: 336,294 characters


In [8]:
# Initialize LangExtract
# PROMPT_DESCRIPTION and EXAMPLES come from the project-local extraction_config
# module and are passed unchanged to lx.extract in the extraction cell.
import langextract as lx
from extraction_config import PROMPT_DESCRIPTION, EXAMPLES


# Quick sanity check that the prompt and few-shot examples were loaded.
print(f"Prompt length: {len(PROMPT_DESCRIPTION)} chars")
print(f"Examples: {len(EXAMPLES)}")

Prompt length: 2539 chars
Examples: 6


In [9]:
# Run extraction - THIS IS THE REAL LANGEXTRACT CALL
# Inputs come from earlier cells: the 8-K text is the document to annotate, and
# the catalog context is supplied separately as additional context.
print("Running LangExtract extraction...", f"Model: {model_name}", sep="\n")


annotated_doc = lx.extract(
    model_id=model_name,
    prompt_description=PROMPT_DESCRIPTION,
    examples=EXAMPLES,
    text_or_documents=sample_8k_text,        # only the 8-K document itself
    additional_context=llm_context,          # catalog, added to every chunk
    resolver_params=dict(suppress_parse_errors=suppress_parse_errors_value),
)

# The extractions live on the returned annotated document.
raw_extractions = annotated_doc.extractions

print(f"✓ LangExtract returned {len(raw_extractions)} extractions")

Running LangExtract extraction...
Model: gemini-2.0-flash


  model = _create_model_with_schema(
[94m[1mLangExtract[0m: model=[92mgemini-2.0-flash[0m, current=[92m8,334[0m chars, processed=[92m0[0m chars:  [00:28]


ResolverParsingError: Failed to parse JSON content: Unterminated string starting at: line 52 column 25 (char 1986)

In [None]:
# Inspect RAW LangExtract output (before postprocessing)
# One entry per extraction: class, text (cut to 80 chars), character span,
# alignment status, and every attribute that is not None.
print(f"RAW LANGEXTRACT OUTPUT ({len(raw_extractions)} extractions)")
print("=" * 70)

for i, ext in enumerate(raw_extractions, 1):
    print(f"\n[{i}] Raw Extraction")
    print(f"    class: {ext.extraction_class}")

    # Long texts are shortened and marked with an ellipsis.
    snippet = ext.extraction_text
    if len(snippet) > 80:
        snippet = f"{snippet[:80]}..."
    print(f'    text: "{snippet}"')

    # char_interval may be missing, in which case no span is available.
    interval = ext.char_interval
    span = f"{interval.start_pos} - {interval.end_pos}" if interval else "N/A"
    print(f"    span: {span}")

    if ext.alignment_status:
        print(f"    alignment: {ext.alignment_status.value}")

    attrs = ext.attributes or {}
    if attrs:
        print("    --- Attributes ---")
        for key, val in attrs.items():
            if val is None:
                continue
            print(f"    {key}: {val}")

---
## 9. Postprocess Real Extractions

Runs the postprocessor on actual LangExtract output:
1. Filters extractions pointing into catalog context (precision fix)
2. Validates qnames, units, periods
3. Parses numeric values deterministically
4. Assigns status: COMMITTED / CANDIDATE_ONLY / REVIEW

In [None]:
# Map raw extractions to RawExtraction dataclass, then postprocess
from extraction_schema import RawExtraction, ExtractionStatus, UNMATCHED
from postprocessor import postprocess

def safe_float(value, default=0.0):
    """Convert ``value`` to ``float``, falling back to ``default``.

    Args:
        value: Anything ``float()`` accepts; ``None`` is treated as "missing".
        default: Returned when ``value`` is ``None`` or cannot be converted.

    Returns:
        ``float(value)`` on success, otherwise ``default``. A warning line is
        printed when a non-``None`` value fails to convert.
    """
    if value is not None:
        try:
            return float(value)
        except (ValueError, TypeError):
            # Unparseable input: report it, then fall through to the default.
            print(f"  Warning: Could not parse confidence '{value}', using {default}")
    return default

# Validation sets handed to the postprocessor.
# NOTE(review): valid_units is a fixed set of short names, while catalog.units
# lists prefixed ids such as iso4217:USD — confirm the postprocessor expects
# the short forms.
valid_qnames = {*catalog.concepts.keys()}
valid_units = {"USD", "USD/share", "shares", "pure"}  # Standard units

# Anything whose span ends past this length cannot belong to the 8-K text
# (precision fix against extractions pointing into the catalog context).
source_text_length = len(sample_8k_text)

# Convert LangExtract extractions into RawExtraction records.
mapped_extractions = []
for ext in raw_extractions:
    # Only financial_fact extractions are mapped; other classes are ignored.
    if ext.extraction_class != 'financial_fact':
        continue

    # A missing char_interval is recorded as the span (0, 0).
    interval = ext.char_interval
    char_start, char_end = (interval.start_pos, interval.end_pos) if interval else (0, 0)

    if char_end > source_text_length:
        print(f"Dropping extraction at char_end={char_end} (beyond source text at {source_text_length})")
        continue

    attrs = ext.attributes or {}

    # Missing or empty required attributes fall back to UNMATCHED / '' / 0.0;
    # the optional ones are passed through as-is (possibly None).
    record = RawExtraction(
        extraction_text=ext.extraction_text,
        char_start=char_start,
        char_end=char_end,
        concept_top1=attrs.get('concept_top1') or UNMATCHED,
        matched_period=attrs.get('matched_period') or '',
        matched_unit=attrs.get('matched_unit') or '',
        confidence=safe_float(attrs.get('confidence'), 0.0),
        reasoning=attrs.get('reasoning') or '',
        concept_top2=attrs.get('concept_top2'),
        matched_dimension=attrs.get('matched_dimension'),
        matched_member=attrs.get('matched_member')
    )
    mapped_extractions.append(record)

print(f"Mapped {len(mapped_extractions)} financial_fact extractions for postprocessing")

In [None]:
# Run postprocessor
# Validates the mapped extractions against the catalog's qnames and the allowed
# units, then prints one entry per fact followed by a status summary.
from postprocessor import validate_period

processed_facts = postprocess(mapped_extractions, valid_qnames, valid_units)

# Display results
print(f"POSTPROCESSED FACTS ({len(processed_facts)} total)\n" + "="*70)

# Anything that is neither COMMITTED nor CANDIDATE_ONLY is shown with the "✗" icon.
status_icons = {
    ExtractionStatus.COMMITTED: "✓",
    ExtractionStatus.CANDIDATE_ONLY: "○",
}

for i, fact in enumerate(processed_facts, 1):
    status_icon = status_icons.get(fact.status, "✗")
    print(f"\n[{i}] {status_icon} {fact.status.value}")

    # Add the ellipsis only when the text was really cut, matching the raw
    # extraction dump earlier in the notebook.
    text = fact.extraction_text
    preview = f"{text[:60]}..." if len(text) > 60 else text
    print(f'    Text: "{preview}"')

    print(f"    Concept: {fact.concept_top1}")
    if fact.concept_top2:
        print(f"    Concept2: {fact.concept_top2}")

    # Compare against None explicitly: a parsed value of 0 is a real value and
    # must not be reported as "None".
    if fact.value_parsed is not None:
        print(f"    Value: {fact.value_parsed:,.2f}")
    else:
        print("    Value: None")

    print(f"    Period: {fact.matched_period}")
    print(f"    Unit: {fact.matched_unit} (valid={fact.unit_valid})")
    print(f"    Confidence: {fact.confidence:.2f}")
    if fact.parse_error:
        print(f"    Parse Error: {fact.parse_error}")

# Summary
committed = sum(1 for f in processed_facts if f.status == ExtractionStatus.COMMITTED)
candidate = sum(1 for f in processed_facts if f.status == ExtractionStatus.CANDIDATE_ONLY)
review = sum(1 for f in processed_facts if f.status == ExtractionStatus.REVIEW)
print(f"\n" + "="*70)
print(f"SUMMARY: {committed} COMMITTED | {candidate} CANDIDATE_ONLY | {review} REVIEW")

---
## 10. Save Extraction Results

Saves extractions in two formats:
- **JSONL**: Raw machine-readable format with all extraction data
- **HTML**: Interactive visualization with entity highlighting in source context

In [None]:
# Output directory
import os
from datetime import datetime

# NOTE(review): absolute, machine-specific path; the catalog HTML export cell
# writes into the same directory.
OUTPUT_DIR = "/home/faisal/EventMarketDB/drivers/8K_XBRL_Linking/output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Generate timestamp for filenames
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Name the files after the ticker of the report that was actually fetched, so
# changing ACCESSION_NO does not produce mislabelled output. A missing ticker
# falls back to "UNKNOWN"; "/" is replaced so the name stays a single path part.
ticker_label = str(report_ticker or "UNKNOWN").replace("/", "-")
base_name = f"{ticker_label}_8K_{timestamp}"

print(f"Output directory: {OUTPUT_DIR}")
print(f"Base filename: {base_name}")

In [None]:
# Persist the annotated document as JSONL through LangExtract's own writer.
jsonl_name = f"{base_name}.jsonl"
jsonl_path = os.path.join(OUTPUT_DIR, jsonl_name)

# The writer takes an iterable of AnnotatedDocument plus directory and file
# name separately; jsonl_path is only used for the confirmation message.
lx.io.save_annotated_documents(
    annotated_documents=[annotated_doc],
    output_name=jsonl_name,
    output_dir=OUTPUT_DIR,
)

print(f"✓ Saved annotated document to: {jsonl_path}")

In [None]:
# Generate interactive HTML visualization
html_path = os.path.join(OUTPUT_DIR, f"{base_name}.html")

# Use LangExtract's visualize - can take AnnotatedDocument directly
html_content = lx.visualize(annotated_doc)

# The result is either an object exposing the markup on `.data` or something
# that is converted with str().
html_text = html_content.data if hasattr(html_content, 'data') else str(html_content)

# Write as UTF-8 explicitly: the markup embeds text from the filing, which may
# contain characters the platform's default encoding cannot represent.
with open(html_path, "w", encoding="utf-8") as f:
    f.write(html_text)

print(f"✓ Saved HTML visualization to: {html_path}")
print(f"\nOpen in browser to view extractions highlighted in source text.")

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so running
# this cell raises NameError — presumably a deliberate marker to halt "Run All"
# at this point; confirm, or replace with an explicit raise.
stop