# XBRL Company Data Explorer
Fetch all unique XBRL Concepts, Members, and Domains for a particular company

## 1. Setup and Imports

In [1]:
import os
import re

import pandas as pd
from neo4j import GraphDatabase

In [2]:
def format_period_label(form_type, report_date, fiscal_year_end_month):
    """Convert a report date into a fiscal period label.

    Args:
        form_type: SEC form type, e.g. '10-K' or '10-Q'.
        report_date: Period-of-report date as a 'YYYY-MM-DD' string.
        fiscal_year_end_month: Month (1-12, int or numeric string) in which
            the company's fiscal year ends.

    Returns:
        'FY<year>' for a 10-K, 'Q<n> FY<year>' for a 10-Q, and
        '<form_type>-<YYYY-MM>' for any other form type or when the month
        values fall outside 1-12.
    """
    if form_type == '10-K':
        return f"FY{report_date[:4]}"

    if form_type == '10-Q':
        month = int(report_date[5:7])
        year = int(report_date[:4])
        fy_end = int(fiscal_year_end_month)

        if 1 <= month <= 12 and 1 <= fy_end <= 12:
            # Months elapsed since the fiscal year started (0-11), bucketed
            # into four 3-month quarters. One formula covers every year-end
            # month (Dec, Sep, Mar, May, ...), so no per-company tables.
            quarter = ((month - fy_end - 1) % 12) // 3 + 1

            # The fiscal year is labelled by the calendar year in which it
            # ends, so months after the year-end month belong to the fiscal
            # year that closes in the following calendar year.
            if month > fy_end:
                year += 1

            return f"Q{quarter} FY{year}"

    return f"{form_type}-{report_date[:7]}"

## 2. Neo4j Connection

In [3]:
# Neo4j connection parameters
# Bolt endpoint of the Neo4j instance (localhost, port 30687)
uri = "bolt://localhost:30687"
username = "neo4j"
# Password is read from the NEO4J_PASSWORD environment variable;
# 'your_password' is only a placeholder fallback when the variable is unset
password = os.getenv('NEO4J_PASSWORD', 'your_password')

# Module-level driver; the query functions in this notebook open their sessions from it
driver = GraphDatabase.driver(uri, auth=(username, password))

## 3. Function: Get All Unique Concepts with Values

In [4]:
def get_company_concepts(ticker, form_types=None):
    """
    Fetch the XBRL concepts reported by a company, with their fact values in context.

    Args:
        ticker: Ticker used to match the Company node.
        form_types: Iterable of report form types to include.
            Defaults to ['10-K', '10-Q'].

    Returns:
        pandas.DataFrame with one row per Concept node, ordered by
        unique_value_count (descending), then namespace and qname.
        Besides the concept attributes it carries fact_count, report_count,
        forms_used, unique_value_count and value_tuples, where each tuple is
        (value, form, period label, unit, context id, comma-joined members).
        `period_type` duplicates `concept_period_type` for backward
        compatibility. The frame is empty when nothing matches, including
        when the ticker is unknown.
    """
    # None sentinel instead of a mutable list default shared across calls
    form_types = ['10-K', '10-Q'] if form_types is None else list(form_types)

    # First get company fiscal year info
    company_query = "MATCH (c:Company {ticker: $ticker}) RETURN c.fiscal_year_end_month as fy_month"

    query = """
    MATCH (c:Company {ticker: $ticker})-[:PRIMARY_FILER]-(r:Report)-[:HAS_XBRL]-(x:XBRLNode)<-[:REPORTS]-(f:Fact)-[:HAS_CONCEPT]-(concept:Concept)
    WHERE r.formType IN $form_types AND f.value IS NOT NULL AND f.value <> 'null' AND f.is_nil <> '1'
    OPTIONAL MATCH (f)-[:HAS_PERIOD]-(p:Period)
    OPTIONAL MATCH (f)-[:HAS_UNIT]-(u:Unit)
    OPTIONAL MATCH (f)-[:FACT_MEMBER]-(m:Member)
    OPTIONAL MATCH (f)-[:IN_CONTEXT]-(ctx:Context)
    WITH concept,
         f,
         r,
         p.period_type as period_type,
         p.start_date as start_date,
         p.end_date as end_date,
         u.name as unit,
         ctx.context_id as context,
         collect(DISTINCT m.label) as members
    WITH concept,
         count(DISTINCT f) as fact_count,
         count(DISTINCT r) as report_count,
         collect(DISTINCT r.formType) as forms_used,
         collect(DISTINCT {
             value: f.value,
             form: r.formType,
             report_date: r.periodOfReport,
             period_type: period_type,
             period_start: start_date,
             period_end: end_date,
             unit: unit,
             context: context,
             members: members,
             is_numeric: f.is_numeric
         }) as raw_values
    RETURN DISTINCT
        concept.qname as qname,
        concept.label as label,
        concept.namespace as namespace,
        concept.balance as balance_type,
        concept.period_type as period_type,  // For backward compatibility
        concept.period_type as concept_period_type,
        concept.category as category,
        concept.concept_type as concept_type,
        concept.u_id as unique_id,
        fact_count,
        report_count,
        forms_used,
        size(raw_values) as unique_value_count,
        raw_values as value_tuples
    ORDER BY unique_value_count DESC, concept.namespace, concept.qname
    """

    # One session serves both queries; each result is fully consumed before
    # the next statement runs and before the session closes.
    with driver.session() as session:
        company_rows = list(session.run(company_query, ticker=ticker))
        # Unknown ticker or missing property: fall back to a calendar fiscal
        # year so the function returns an empty frame instead of raising.
        fy_month = (company_rows[0]['fy_month'] if company_rows else None) or '12'

        result = session.run(query, ticker=ticker, form_types=form_types)
        df = pd.DataFrame([dict(record) for record in result])

    if df.empty or 'value_tuples' not in df.columns:
        return df

    def _to_display_tuples(raw_values):
        """Turn the raw value maps of one concept into sorted display tuples."""
        if not raw_values:
            return []
        # A missing report date sorts first instead of breaking the comparison
        ordered = sorted(raw_values, key=lambda v: v['report_date'] or '')
        return [
            (
                v['value'],
                v['form'],
                format_period_label(v['form'], v['report_date'], fy_month)
                if v['report_date'] else 'N/A',
                v['unit'] or 'N/A',
                v['context'],
                ', '.join(v['members']) if v['members'] else '',
            )
            for v in ordered
        ]

    # Format tuples with period labels and context
    df['value_tuples'] = df['value_tuples'].apply(_to_display_tuples)
    return df

## 4. Function: Get All Unique Members

In [5]:
def get_company_members(ticker, form_types=None):
    """
    Fetch all unique XBRL members referenced by facts in a company's reports.

    Args:
        ticker: Ticker used to match the Company node.
        form_types: Iterable of report form types to include.
            Defaults to ['10-K', '10-Q'].

    Returns:
        pandas.DataFrame with one row per Member node (qname, label, level,
        parent_qname, unique_id, fact_count, report_count), ordered by
        fact_count descending and then qname. Empty when nothing matches.
    """
    # None sentinel instead of a mutable list default shared across calls
    form_types = ['10-K', '10-Q'] if form_types is None else list(form_types)

    query = """
    MATCH (c:Company {ticker: $ticker})-[:PRIMARY_FILER]-(r:Report)-[:HAS_XBRL]-(x:XBRLNode)<-[:REPORTS]-(f:Fact)-[:FACT_MEMBER]-(m:Member)
    WHERE r.formType IN $form_types
    WITH m,
         count(DISTINCT f) as fact_count,
         count(DISTINCT r) as report_count
    RETURN DISTINCT
        m.qname as qname,
        m.label as label,
        m.level as level,
        m.parent_qname as parent_qname,
        m.u_id as unique_id,
        fact_count,
        report_count
    ORDER BY fact_count DESC, m.qname
    """

    # Materialize the records while the session is still open
    with driver.session() as session:
        result = session.run(query, ticker=ticker, form_types=form_types)
        rows = [dict(record) for record in result]

    return pd.DataFrame(rows)

## 5. Function: Get All Unique Domains

In [6]:
def get_company_domains(ticker, form_types=None):
    """
    Fetch all unique XBRL domains whose members appear in a company's reports.

    Args:
        ticker: Ticker used to match the Company node.
        form_types: Iterable of report form types to include.
            Defaults to ['10-K', '10-Q'].

    Returns:
        pandas.DataFrame with one row per Domain node (qname, label, level,
        parent_qname, unique_id, member_count), ordered by member_count
        descending and then qname. member_count is the number of distinct
        members of that domain used by the matched facts. Empty when nothing
        matches.
    """
    # None sentinel instead of a mutable list default shared across calls
    form_types = ['10-K', '10-Q'] if form_types is None else list(form_types)

    query = """
    MATCH (c:Company {ticker: $ticker})-[:PRIMARY_FILER]-(r:Report)-[:HAS_XBRL]-(x:XBRLNode)<-[:REPORTS]-(f:Fact)-[:FACT_MEMBER]-(m:Member)
    WHERE r.formType IN $form_types
    WITH m
    MATCH (dom:Domain)-[:HAS_MEMBER]-(m)
    WITH dom,
         count(DISTINCT m) as member_count
    RETURN DISTINCT
        dom.qname as qname,
        dom.label as label,
        dom.level as level,
        dom.parent_qname as parent_qname,
        dom.u_id as unique_id,
        member_count
    ORDER BY member_count DESC, dom.qname
    """

    # Materialize the records while the session is still open
    with driver.session() as session:
        result = session.run(query, ticker=ticker, form_types=form_types)
        rows = [dict(record) for record in result]

    return pd.DataFrame(rows)

## 6. Set Company Ticker

In [7]:
# Company to explore; passed as `ticker` to the get_company_* functions
TICKER = 'AAPL'  # Change this to any company ticker
# Report form types included in every query (matched against r.formType)
FORM_TYPES = ['10-K', '10-Q']  # Can also use ['10-K'] or ['10-Q'] separately

## 7. Fetch All Unique Concepts

In [8]:
# Get all concepts
# The query returns one row per Concept node, so the same qname can appear
# once per taxonomy namespace (e.g. us-gaap/2023 and us-gaap/2024)
concepts_df = get_company_concepts(TICKER, FORM_TYPES)
print(f"Total unique concepts for {TICKER}: {len(concepts_df)}")
print(f"\nNamespaces found: {concepts_df['namespace'].nunique()}")

Total unique concepts for AAPL: 1123

Namespaces found: 19


In [9]:
# Display the first ten concepts; rows arrive ordered by unique_value_count
# descending (see the ORDER BY in get_company_concepts)
concepts_df.head(10)

Unnamed: 0,qname,label,namespace,balance_type,period_type,concept_period_type,category,concept_type,unique_id,fact_count,report_count,forms_used,unique_value_count,value_tuples
0,us-gaap:RevenueFromContractWithCustomerExcludi...,Revenues,http://fasb.org/us-gaap/2023,credit,duration,duration,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:RevenueFr...,213,5,"[10-K, 10-Q]",213,"[(156,778,000,000, 10-Q, Q4 FY2023, iso4217:US..."
1,us-gaap:RevenueFromContractWithCustomerExcludi...,"Revenue from Contract with Customer, Excluding...",http://fasb.org/us-gaap/2024,credit,duration,duration,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2024:us-gaap:RevenueFr...,165,4,"[10-K, 10-Q]",165,"[(66,952,000,000, 10-K, FY2024, iso4217:USD, c..."
2,us-gaap:CashAndCashEquivalentsAtCarryingValue,"Cash and Cash Equivalents, at Carrying Value",http://fasb.org/us-gaap/2023,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:CashAndCa...,140,5,"[10-K, 10-Q]",140,"[(2,929,000,000, 10-Q, Q4 FY2023, iso4217:USD,..."
3,us-gaap:MarketableSecuritiesCurrent,Marketable securities,http://fasb.org/us-gaap/2023,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:Marketabl...,140,5,"[10-K, 10-Q]",140,"[(355,000,000, 10-Q, Q4 FY2023, iso4217:USD, c..."
4,us-gaap:MarketableSecuritiesNoncurrent,"Marketable Securities, Noncurrent",http://fasb.org/us-gaap/2023,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:Marketabl...,140,5,"[10-K, 10-Q]",140,"[(120,805,000,000, 10-Q, Q4 FY2023, iso4217:US..."
5,us-gaap:OperatingIncomeLoss,Decrease to loss from operations,http://fasb.org/us-gaap/2023,credit,duration,duration,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:Operating...,119,5,"[10-K, 10-Q]",119,"[(22,998,000,000, 10-Q, Q4 FY2023, iso4217:USD..."
6,us-gaap:CashAndCashEquivalentsAtCarryingValue,"Cash and Cash Equivalents, at Carrying Value",http://fasb.org/us-gaap/2024,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2024:us-gaap:CashAndCa...,112,4,"[10-K, 10-Q]",112,"[(0, 10-K, FY2024, iso4217:USD, c-65, FairValu..."
7,us-gaap:MarketableSecuritiesCurrent,"Marketable Securities, Current",http://fasb.org/us-gaap/2024,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2024:us-gaap:Marketabl...,106,4,"[10-K, 10-Q]",106,"[(271,000,000, 10-K, FY2024, iso4217:USD, c-81..."
8,us-gaap:MarketableSecuritiesNoncurrent,"Marketable Securities, Noncurrent",http://fasb.org/us-gaap/2024,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2024:us-gaap:Marketabl...,106,4,"[10-K, 10-Q]",106,"[(100,544,000,000, 10-K, FY2024, iso4217:USD, ..."
9,us-gaap:StockholdersEquity,Stockholders' Equity Attributable to Parent,http://fasb.org/us-gaap/2023,credit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:Stockhold...,104,5,"[10-K, 10-Q]",104,"[(63,090,000,000, 10-Q, Q4 FY2023, iso4217:USD..."


In [10]:
# List the distinct concept namespaces present in the result
concepts_df.namespace.unique()

array(['http://fasb.org/us-gaap/2023', 'http://fasb.org/us-gaap/2024',
       'http://fasb.org/us-gaap/2022', 'http://xbrl.sec.gov/dei/2023',
       'http://xbrl.sec.gov/dei/2024', 'http://xbrl.sec.gov/dei/2022',
       'http://xbrl.sec.gov/ecd/2023', 'http://www.apple.com/20221231',
       'http://www.apple.com/20230401', 'http://www.apple.com/20230701',
       'http://www.apple.com/20230930', 'http://www.apple.com/20231230',
       'http://www.apple.com/20240330', 'http://www.apple.com/20240629',
       'http://www.apple.com/20240928', 'http://www.apple.com/20241228',
       'http://www.apple.com/20250329', 'http://www.apple.com/20250628',
       'http://xbrl.sec.gov/ecd/2024'], dtype=object)

## 8. Filter Concepts Examples

In [11]:
# Filter by US-GAAP concepts only
# regex=False matches the namespace fragment literally (as a regex, '.' would
# match any character); na=False drops rows without a namespace instead of
# putting NaN into the boolean mask
usgaap_concepts = concepts_df[
    concepts_df['namespace'].str.contains('fasb.org/us-gaap', regex=False, na=False)
]
print(f"US-GAAP concepts: {len(usgaap_concepts)}")
usgaap_concepts.head()

US-GAAP concepts: 785


Unnamed: 0,qname,label,namespace,balance_type,period_type,concept_period_type,category,concept_type,unique_id,fact_count,report_count,forms_used,unique_value_count,value_tuples
0,us-gaap:RevenueFromContractWithCustomerExcludi...,Revenues,http://fasb.org/us-gaap/2023,credit,duration,duration,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:RevenueFr...,213,5,"[10-K, 10-Q]",213,"[(156,778,000,000, 10-Q, Q4 FY2023, iso4217:US..."
1,us-gaap:RevenueFromContractWithCustomerExcludi...,"Revenue from Contract with Customer, Excluding...",http://fasb.org/us-gaap/2024,credit,duration,duration,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2024:us-gaap:RevenueFr...,165,4,"[10-K, 10-Q]",165,"[(66,952,000,000, 10-K, FY2024, iso4217:USD, c..."
2,us-gaap:CashAndCashEquivalentsAtCarryingValue,"Cash and Cash Equivalents, at Carrying Value",http://fasb.org/us-gaap/2023,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:CashAndCa...,140,5,"[10-K, 10-Q]",140,"[(2,929,000,000, 10-Q, Q4 FY2023, iso4217:USD,..."
3,us-gaap:MarketableSecuritiesCurrent,Marketable securities,http://fasb.org/us-gaap/2023,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:Marketabl...,140,5,"[10-K, 10-Q]",140,"[(355,000,000, 10-Q, Q4 FY2023, iso4217:USD, c..."
4,us-gaap:MarketableSecuritiesNoncurrent,"Marketable Securities, Noncurrent",http://fasb.org/us-gaap/2023,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:Marketabl...,140,5,"[10-K, 10-Q]",140,"[(120,805,000,000, 10-Q, Q4 FY2023, iso4217:US..."


In [12]:
# Filter by balance type
# balance_type is the alias get_company_concepts gives to concept.balance
debit_concepts = concepts_df[concepts_df['balance_type'] == 'debit']
print(f"Debit balance concepts: {len(debit_concepts)}")
debit_concepts.head()

Debit balance concepts: 327


Unnamed: 0,qname,label,namespace,balance_type,period_type,concept_period_type,category,concept_type,unique_id,fact_count,report_count,forms_used,unique_value_count,value_tuples
2,us-gaap:CashAndCashEquivalentsAtCarryingValue,"Cash and Cash Equivalents, at Carrying Value",http://fasb.org/us-gaap/2023,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:CashAndCa...,140,5,"[10-K, 10-Q]",140,"[(2,929,000,000, 10-Q, Q4 FY2023, iso4217:USD,..."
3,us-gaap:MarketableSecuritiesCurrent,Marketable securities,http://fasb.org/us-gaap/2023,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:Marketabl...,140,5,"[10-K, 10-Q]",140,"[(355,000,000, 10-Q, Q4 FY2023, iso4217:USD, c..."
4,us-gaap:MarketableSecuritiesNoncurrent,"Marketable Securities, Noncurrent",http://fasb.org/us-gaap/2023,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:Marketabl...,140,5,"[10-K, 10-Q]",140,"[(120,805,000,000, 10-Q, Q4 FY2023, iso4217:US..."
6,us-gaap:CashAndCashEquivalentsAtCarryingValue,"Cash and Cash Equivalents, at Carrying Value",http://fasb.org/us-gaap/2024,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2024:us-gaap:CashAndCa...,112,4,"[10-K, 10-Q]",112,"[(0, 10-K, FY2024, iso4217:USD, c-65, FairValu..."
7,us-gaap:MarketableSecuritiesCurrent,"Marketable Securities, Current",http://fasb.org/us-gaap/2024,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2024:us-gaap:Marketabl...,106,4,"[10-K, 10-Q]",106,"[(271,000,000, 10-K, FY2024, iso4217:USD, c-81..."


## Enhanced Value Analysis with Context

In [13]:
# Requires concepts_df from get_company_concepts (the "Fetch All Unique Concepts" cell).
# Walks through the value_tuples column: a sample concept with its context,
# a fiscal-year filter, and a per-namespace roll-up.

has_enhanced_columns = 'unique_value_count' in concepts_df.columns

if not has_enhanced_columns:
    # concepts_df came from an older get_company_concepts; report what is available
    print("Enhanced columns not found. Please re-run the notebook from the beginning.")
    print("The enhanced get_company_concepts function adds 'unique_value_count' and 'value_tuples' columns.")
    print("\nCurrent columns:", concepts_df.columns.tolist())

    # Basic statistics from the columns that do exist
    if 'fact_count' in concepts_df.columns:
        concepts_with_facts = int((concepts_df['fact_count'] > 0).sum())
        print(f"\nTotal concepts: {len(concepts_df)}")
        print(f"Concepts with facts: {concepts_with_facts}")
        print("\nTop 5 concepts by fact count:")
        print(concepts_df.nlargest(5, 'fact_count')[['label', 'fact_count']])
else:
    # Concepts that carry at least one distinct value
    has_values_mask = concepts_df['unique_value_count'] > 0
    concepts_with_values = concepts_df[has_values_mask]
    print(f"Concepts with values: {len(concepts_with_values)}")

    # Show the first concept together with up to five of its value tuples
    if not concepts_with_values.empty:
        sample = concepts_with_values.iloc[0]
        print(f"\n{sample['label']} ({sample['qname']}):")
        print(f"Unique values: {sample['unique_value_count']}")
        print("\nValue Details (value, form, period, unit, context, members):")
        for value, form, period, unit, context, members in sample['value_tuples'][:5]:
            members_str = f" [{members}]" if members else ""
            print(f"  {period} ({form}): {value} {unit} - Context: {context}{members_str}")

    # Keep only tuples whose period label (third element) mentions FY2024
    fy2024_concepts = concepts_df.copy()
    fy2024_concepts['fy2024_values'] = fy2024_concepts['value_tuples'].apply(
        lambda tuples: [
            (v, f, p, u, c, m) for v, f, p, u, c, m in tuples if 'FY2024' in p
        ] if tuples else []
    )

    # Count concepts with FY2024 data
    fy2024_sizes = fy2024_concepts['fy2024_values'].apply(len)
    concepts_with_fy2024 = fy2024_concepts[fy2024_sizes > 0]
    print(f"\nConcepts with FY2024 data: {len(concepts_with_fy2024)}")

    # Per-namespace totals: number of concepts and summed distinct-value counts
    namespace_summary = concepts_df.groupby('namespace').agg(
        concept_count=('qname', 'count'),
        total_values=('unique_value_count', 'sum'),
    )
    print(f"\nNamespace Summary:\n{namespace_summary.head()}")

Concepts with values: 1123

Revenues (us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax):
Unique values: 213

Value Details (value, form, period, unit, context, members):
  Q4 FY2023 (10-Q): 156,778,000,000 iso4217:USD - Context: c-63 [IPhone]
  Q4 FY2023 (10-Q): 28,669,000,000 iso4217:USD - Context: c-68 [Mac]
  Q4 FY2023 (10-Q): 162,863,000,000 iso4217:USD - Context: c-64 [IPhone]
  Q4 FY2023 (10-Q): 7,224,000,000 iso4217:USD - Context: c-70 [IPad]
  Q4 FY2023 (10-Q): 60,584,000,000 iso4217:USD - Context: c-13 [Product]

Concepts with FY2024 data: 611

Namespace Summary:
                               concept_count  total_values
namespace                                                 
http://fasb.org/us-gaap/2022             177          1186
http://fasb.org/us-gaap/2023             311          3299
http://fasb.org/us-gaap/2024             297          2577
http://www.apple.com/20221231             15            34
http://www.apple.com/20230401             16            

In [14]:
# Filter by period type
# period_type here is the concept-level value (alias of concept.period_type),
# not the period of an individual fact
instant_concepts = concepts_df[concepts_df['period_type'] == 'instant']
print(f"Instant period concepts: {len(instant_concepts)}")
instant_concepts.head()

Instant period concepts: 496


Unnamed: 0,qname,label,namespace,balance_type,period_type,concept_period_type,category,concept_type,unique_id,fact_count,report_count,forms_used,unique_value_count,value_tuples
2,us-gaap:CashAndCashEquivalentsAtCarryingValue,"Cash and Cash Equivalents, at Carrying Value",http://fasb.org/us-gaap/2023,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:CashAndCa...,140,5,"[10-K, 10-Q]",140,"[(2,929,000,000, 10-Q, Q4 FY2023, iso4217:USD,..."
3,us-gaap:MarketableSecuritiesCurrent,Marketable securities,http://fasb.org/us-gaap/2023,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:Marketabl...,140,5,"[10-K, 10-Q]",140,"[(355,000,000, 10-Q, Q4 FY2023, iso4217:USD, c..."
4,us-gaap:MarketableSecuritiesNoncurrent,"Marketable Securities, Noncurrent",http://fasb.org/us-gaap/2023,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2023:us-gaap:Marketabl...,140,5,"[10-K, 10-Q]",140,"[(120,805,000,000, 10-Q, Q4 FY2023, iso4217:US..."
6,us-gaap:CashAndCashEquivalentsAtCarryingValue,"Cash and Cash Equivalents, at Carrying Value",http://fasb.org/us-gaap/2024,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2024:us-gaap:CashAndCa...,112,4,"[10-K, 10-Q]",112,"[(0, 10-K, FY2024, iso4217:USD, c-65, FairValu..."
7,us-gaap:MarketableSecuritiesCurrent,"Marketable Securities, Current",http://fasb.org/us-gaap/2024,debit,instant,instant,Concept,xbrli:monetaryItemType,http://fasb.org/us-gaap/2024:us-gaap:Marketabl...,106,4,"[10-K, 10-Q]",106,"[(271,000,000, 10-K, FY2024, iso4217:USD, c-81..."


## 9. Fetch All Unique Members

In [15]:
# Get all members
# One row per Member node referenced by the company's facts
members_df = get_company_members(TICKER, FORM_TYPES)
print(f"Total unique members for {TICKER}: {len(members_df)}")

Total unique members for AAPL: 390


In [16]:
# Display the first ten members; rows arrive ordered by fact_count descending
# (see the ORDER BY in get_company_members)
members_df.head(10)

Unnamed: 0,qname,label,level,parent_qname,unique_id,fact_count,report_count
0,us-gaap:FairValueInputsLevel2Member,FairValueInputsLevel2,0,,320193:http://fasb.org/us-gaap/2023:us-gaap:Fa...,658,5
1,us-gaap:FairValueInputsLevel2Member,FairValueInputsLevel2,0,,320193:http://fasb.org/us-gaap/2024:us-gaap:Fa...,512,4
2,us-gaap:FairValueInputsLevel2Member,FairValueInputsLevel2,0,,320193:http://fasb.org/us-gaap/2022:us-gaap:Fa...,274,2
3,us-gaap:FairValueInputsLevel1Member,FairValueInputsLevel1,0,,320193:http://fasb.org/us-gaap/2023:us-gaap:Fa...,210,5
4,us-gaap:FairValueInputsLevel1Member,FairValueInputsLevel1,0,,320193:http://fasb.org/us-gaap/2024:us-gaap:Fa...,168,4
5,us-gaap:RestrictedStockUnitsRSUMember,RestrictedStockUnitsRSU,0,,320193:http://fasb.org/us-gaap/2023:us-gaap:Re...,107,5
6,us-gaap:RetainedEarningsMember,RetainedEarnings,0,,320193:http://fasb.org/us-gaap/2023:us-gaap:Re...,94,5
7,us-gaap:RestrictedStockUnitsRSUMember,RestrictedStockUnitsRSU,0,,320193:http://fasb.org/us-gaap/2024:us-gaap:Re...,87,4
8,us-gaap:CommercialPaperMember,CommercialPaper,0,,320193:http://fasb.org/us-gaap/2023:us-gaap:Co...,84,5
9,us-gaap:FairValueInputsLevel1Member,FairValueInputsLevel1,0,,320193:http://fasb.org/us-gaap/2022:us-gaap:Fa...,84,2


## 10. Fetch All Unique Domains

In [17]:
# Get all domains
# One row per Domain node linked via HAS_MEMBER to a member used by the company's facts
domains_df = get_company_domains(TICKER, FORM_TYPES)
print(f"Total unique domains for {TICKER}: {len(domains_df)}")

Total unique domains for AAPL: 65


In [18]:
# Display all domains (usually fewer than members/concepts);
# rows arrive ordered by member_count descending
domains_df

Unnamed: 0,qname,label,level,parent_qname,unique_id,member_count
0,us-gaap:ClassOfStockDomain,ClassOfStock,0,,320193:http://fasb.org/us-gaap/2023:us-gaap:Cl...,43
1,us-gaap:ClassOfStockDomain,ClassOfStock,0,,320193:http://fasb.org/us-gaap/2024:us-gaap:Cl...,32
2,us-gaap:SegmentDomain,Segment,0,,320193:http://fasb.org/us-gaap/2023:us-gaap:Se...,25
3,srt:ProductsAndServicesDomain,ProductsAndServices,0,,320193:http://fasb.org/srt/2023:srt:ProductsAn...,22
4,us-gaap:SegmentDomain,Segment,0,,320193:http://fasb.org/us-gaap/2024:us-gaap:Se...,20
...,...,...,...,...,...,...
60,us-gaap:ShareBasedCompensationArrangementsBySh...,ShareBasedCompensationArrangementsByShareBased...,0,,320193:http://fasb.org/us-gaap/2024:us-gaap:Sh...,1
61,us-gaap:ShortTermDebtTypeDomain,ShortTermDebtType,0,,320193:http://fasb.org/us-gaap/2022:us-gaap:Sh...,1
62,us-gaap:ShortTermDebtTypeDomain,ShortTermDebtType,0,,320193:http://fasb.org/us-gaap/2023:us-gaap:Sh...,1
63,us-gaap:ShortTermDebtTypeDomain,ShortTermDebtType,0,,320193:http://fasb.org/us-gaap/2024:us-gaap:Sh...,1


## 11. Summary Statistics

In [19]:
# Headline counts from the three DataFrames built in the earlier cells
print(f"\n=== XBRL Data Summary for {TICKER} ===")
print(f"Total Unique Concepts: {len(concepts_df)}")
print(f"Total Unique Members: {len(members_df)}")
print(f"Total Unique Domains: {len(domains_df)}")
print(f"\nConcept Namespaces: {concepts_df['namespace'].nunique()}")
print(f"\nTop 5 Most Used Concepts:")
# Rank by fact_count (number of distinct facts reported per concept)
print(concepts_df.nlargest(5, 'fact_count')[['label', 'fact_count']])


=== XBRL Data Summary for AAPL ===
Total Unique Concepts: 1123
Total Unique Members: 390
Total Unique Domains: 65

Concept Namespaces: 19

Top 5 Most Used Concepts:
                                               label  fact_count
0                                           Revenues         213
1  Revenue from Contract with Customer, Excluding...         165
2       Cash and Cash Equivalents, at Carrying Value         140
3                              Marketable securities         140
4                  Marketable Securities, Noncurrent         140


## 13. Close Connection

In [20]:
# # Close the Neo4j driver
# driver.close()
# print("Connection closed.")

## 14. Test Enhanced Function with Value Tuples
Re-run this section to test the enhanced get_company_concepts function that includes value tuples

---
# PILOT TEST: 8-K to XBRL Linking Analysis
---

## Goal: Measure how many facts in Apple's 8-K "Results of Operations" can be correctly linked to existing XBRL concepts

**Approach:**
1. Get sample 8-K Results of Operations report
2. Extract financial metrics manually or via regex
3. Match to concepts from Apple's 10-K/10-Q
4. Sanity check: Compare 8-K values with historical 10-K/10-Q values for the same concept and period
5. Measure: Total facts, % linked, % validated

## Next Steps for Improvement

Based on the results above, here are ways to improve linkage and validation rates:

1. **Better Extraction Patterns**: Add more regex patterns to capture different ways metrics are mentioned
2. **Improved Concept Matching**: Use semantic similarity (embeddings) instead of just string matching
3. **Period Extraction**: Extract quarter/year info from 8-K to match with correct period
4. **Member Matching**: Extract product segments, geographies to link with appropriate Members
5. **LangExtract Integration**: Replace regex with proper LLM-based extraction using LangExtract
6. **Multi-Report Analysis**: Run this on multiple 8-Ks to get better statistics

**Key Insight**: The sanity check is crucial - it tells us whether our linkage is actually correct!

In [21]:
# Final metrics for the 8-K linking pilot.
# Needs matched_df, validated_df and reports_8k from the extraction, matching
# and sanity-check cells; running this cell before them raises NameError.


def _pct(part, whole):
    """Return part/whole as a percentage, or 0 when whole is not positive."""
    return (part / whole * 100) if whole > 0 else 0


# Raw counts
total_extracted = len(matched_df)
total_linked = matched_df['matched'].sum()
if validated_df.empty:
    total_validated = 0
else:
    total_validated = validated_df['sanity_check'].eq('validated').sum()

# Rates derived from the counts
link_rate = _pct(total_linked, total_extracted)
validation_rate = _pct(total_validated, total_linked)
end_to_end_rate = _pct(total_validated, total_extracted)

# Metadata of the analyzed report (first row of reports_8k, if any)
if reports_8k.empty:
    filing_date = 'N/A'
    period_of_report = 'N/A'
else:
    first_report = reports_8k.iloc[0]
    filing_date = first_report['filing_date']
    period_of_report = first_report['period_of_report']

banner = '=' * 80

print(f"\n{banner}")
print("FINAL RESULTS: Apple 8-K Results of Operations Analysis")
print(f"{banner}\n")

print("Report analyzed:")
print(f"  Filing Date: {filing_date}")
print(f"  Period: {period_of_report}")
print("  Section: Results of Operations")
print()

print("Extraction Results:")
print(f"  Total facts extracted from 8-K: {total_extracted}")
print(f"  Successfully linked to XBRL concepts: {total_linked} ({link_rate:.1f}%)")
print(f"  Failed to link: {total_extracted - total_linked} ({100-link_rate:.1f}%)")
print()

print("Validation Results (Sanity Check):")
print(f"  Facts validated against historical data: {total_validated} ({validation_rate:.1f}% of linked)")
print(f"  Facts that failed validation: {total_linked - total_validated}")
print()

print("Overall Accuracy:")
print(f"  End-to-end success rate: {total_validated}/{total_extracted} ({end_to_end_rate:.1f}%)")
print("  (Facts both linked AND validated)")
print()

if not validated_df.empty:
    print("Sanity Check Breakdown:")
    print(validated_df['sanity_check'].value_counts())
    print()

print(f"{banner}\n")

# Show validated facts table
if total_validated > 0:
    print("Successfully Validated Facts:\n")
    validated_columns = [
        'metric_name', 'value_8k_parsed', 'historical_value',
        'value_difference_pct', 'concept_qname'
    ]
    is_validated = validated_df['sanity_check'] == 'validated'
    validated_subset = validated_df[is_validated][validated_columns]
    print(validated_subset.to_string(index=False))
    print()

NameError: name 'matched_df' is not defined

## STEP 5: Final Metrics Summary

In [None]:
def parse_value_to_number(value_text):
    """
    Convert value text like '119.6 billion' or '$94.9 million' to an absolute number.

    Commas and dollar signs are ignored; the scale word is matched
    case-insensitively and is optional.

    Args:
        value_text: Text that starts with a number, optionally followed by
            'billion', 'million' or 'thousand'.

    Returns:
        The value as a float, or None when the input is not a string, does
        not start with a digit or '.', or the numeric part is malformed
        (e.g. '1.2.3').
    """
    if not isinstance(value_text, str):
        return None

    multipliers = {
        'billion': 1_000_000_000,
        'million': 1_000_000,
        'thousand': 1_000,
    }

    cleaned = value_text.replace(',', '').replace('$', '').strip()

    # Extract number and multiplier
    match = re.match(r'([\d\.]+)\s*(billion|million|thousand)?', cleaned, re.IGNORECASE)
    if not match:
        return None

    try:
        number = float(match.group(1))
    except ValueError:
        # The character class also accepts strings such as '.' or '1.2.3'
        return None

    scale_word = match.group(2)
    if scale_word:
        number *= multipliers[scale_word.lower()]

    return number

def get_historical_values_for_concept(concept_qname, ticker='AAPL'):
    """
    Look up historical 10-K/10-Q fact values for one concept in Neo4j.

    Args:
        concept_qname: qname of the Concept node to match.
        ticker: Ticker used to match the Company node.

    Returns:
        pandas.DataFrame with columns value, form_type, report_date,
        period_start, period_end and unit, holding at most 20 facts ordered
        by report date descending. Empty when nothing matches.
    """
    # NOTE(review): this query reaches reports through [:FILED]-> while the
    # get_company_* functions use [:PRIMARY_FILER]; confirm both
    # relationships exist in the graph.
    query = """
    MATCH (c:Company {ticker: $ticker})-[:FILED]->(r:Report)
    -[:HAS_XBRL]->(x:XBRLNode)<-[:REPORTS]-(f:Fact)
    -[:HAS_CONCEPT]->(concept:Concept {qname: $concept_qname})
    WHERE r.formType IN ['10-K', '10-Q'] AND f.value IS NOT NULL
    OPTIONAL MATCH (f)-[:HAS_PERIOD]->(p:Period)
    OPTIONAL MATCH (f)-[:HAS_UNIT]->(u:Unit)
    RETURN f.value as value,
           r.formType as form_type,
           r.periodOfReport as report_date,
           p.start_date as period_start,
           p.end_date as period_end,
           u.name as unit
    ORDER BY r.periodOfReport DESC
    LIMIT 20
    """

    # Materialize the records while the session is still open
    with driver.session() as session:
        records = session.run(query, ticker=ticker, concept_qname=concept_qname)
        rows = [dict(record) for record in records]

    return pd.DataFrame(rows)

def compare_values(value_8k, value_historical, tolerance=0.1):
    """
    Decide whether an 8-K value is close enough to a historical value.

    Args:
        value_8k: Parsed value from the 8-K, or None.
        value_historical: Reference value from a 10-K/10-Q, or None.
        tolerance: Largest accepted relative difference (0.1 = 10%).

    Returns:
        (is_similar, diff_ratio). diff_ratio is the absolute difference
        relative to |value_historical|. When either value is None the result
        is (False, inf). When value_historical is 0 only an 8-K value of 0
        counts as similar and |value_8k| is returned in place of the ratio.
    """
    missing = value_8k is None or value_historical is None
    if missing:
        return False, float('inf')

    # A relative difference is undefined against zero
    if value_historical == 0:
        return value_8k == 0, abs(value_8k)

    baseline = abs(value_historical)
    diff_ratio = abs(value_8k - value_historical) / baseline
    return diff_ratio <= tolerance, diff_ratio

# Perform sanity check on matched facts.
# For every linked fact: parse the 8-K value, fetch historical values of the
# matched concept, and record whether any of them lies within 5% of it.
# Builds validated_df with a 'sanity_check' status per fact:
# 'parse_error', 'no_historical_data', 'validated' or 'no_value_match'.
print(f"{'='*80}")
print(f"SANITY CHECK: Comparing 8-K values with historical 10-K/10-Q data")
print(f"{'='*80}\n")

validated_facts = []
report_period = reports_8k.iloc[0]['period_of_report'] if not reports_8k.empty else None

for idx, row in matched_df[matched_df['matched']].iterrows():
    # Parse 8-K value
    value_8k = parse_value_to_number(row['value_text'])

    if value_8k is None:
        print(f"⚠ Could not parse value: {row['value_text']}")
        validated_facts.append({
            **row,
            'sanity_check': 'parse_error',
            'value_8k_parsed': None,
            'historical_match_found': False
        })
        continue

    # Get historical values for this concept
    historical_df = get_historical_values_for_concept(row['concept_qname'])

    if historical_df.empty:
        print(f"⚠ No historical data for {row['concept_qname']}")
        validated_facts.append({
            **row,
            'sanity_check': 'no_historical_data',
            'value_8k_parsed': value_8k,
            'historical_match_found': False
        })
        continue

    # Scan the historical rows once, tracking two things:
    #   best_*    - closest row that is within tolerance (decides validation)
    #   closest_* - closest numeric value overall (reported when nothing matches)
    best_match, best_value, best_diff = None, None, float('inf')
    closest_value, closest_diff = None, float('inf')

    for _, hist_row in historical_df.iterrows():
        try:
            hist_value = float(hist_row['value'])
        except (ValueError, TypeError):
            # Non-numeric fact value; nothing to compare against
            continue

        is_similar, diff_ratio = compare_values(value_8k, hist_value, tolerance=0.05)

        if diff_ratio < closest_diff:
            closest_value, closest_diff = hist_value, diff_ratio
        if is_similar and diff_ratio < best_diff:
            best_match, best_value, best_diff = hist_row, hist_value, diff_ratio

    if best_match is not None:
        print(f"✓ VALIDATED: {row['metric_name']}")
        print(f"  8-K Value: ${value_8k:,.0f}")
        print(f"  Historical: ${best_value:,.0f} ({best_match['form_type']} {best_match['report_date']})")
        print(f"  Difference: {best_diff*100:.2f}%")
        print(f"  Concept: {row['concept_qname']}")
        print()

        validated_facts.append({
            **row,
            'sanity_check': 'validated',
            'value_8k_parsed': value_8k,
            'historical_value': best_value,
            'historical_report_date': best_match['report_date'],
            'historical_form_type': best_match['form_type'],
            'value_difference_pct': best_diff * 100,
            'historical_match_found': True
        })
    else:
        # Report the genuinely closest numeric value; there may be none when
        # every historical fact for the concept is non-numeric.
        closest_text = f"${closest_value:,.0f}" if closest_value is not None else "n/a"
        print(f"✗ NO MATCH: {row['metric_name']}")
        print(f"  8-K Value: ${value_8k:,.0f}")
        print(f"  Closest historical: {closest_text}")
        print(f"  Concept: {row['concept_qname']}")
        print()

        validated_facts.append({
            **row,
            'sanity_check': 'no_value_match',
            'value_8k_parsed': value_8k,
            'historical_value': closest_value,
            'historical_match_found': False
        })

validated_df = pd.DataFrame(validated_facts)

## STEP 4: Sanity Check - Compare 8-K Values with Historical 10-K/10-Q Data

For each linked fact, query the historical XBRL data to find facts with the same concept and compare their values.
This validates whether our linkage is correct.

In [None]:
def normalize_metric_for_matching(metric_text):
    """Normalize metric text for fuzzy matching.

    Lower-cases the text, drops any "(loss)" qualifier, and collapses runs of
    whitespace into single spaces.

    Args:
        metric_text: Raw metric name, e.g. "Net Income (Loss)".

    Returns:
        The normalized string, without leading or trailing whitespace.
    """
    normalized = metric_text.lower()
    # Swap the qualifier for a space (not an empty string) so the words on
    # either side stay separate: "income (loss) from" -> "income from".
    normalized = re.sub(r'\(loss\)', ' ', normalized)
    # Collapse whatever whitespace is left behind, then trim both ends.
    return re.sub(r'\s+', ' ', normalized).strip()

def match_fact_to_concept(fact, concepts_df, debug=False):
    """
    Try to match an extracted fact to an XBRL concept.

    Strategies are tried in order; the first one producing a candidate wins
    and the first candidate row (in concepts_df order) is returned:
      1. label equals the normalized metric (case-insensitive) -> 'exact_label', 1.0
      2. label contains the metric, or the metric contains the label
                                                            -> 'label_contains', 0.8
      3. metric with spaces/underscores removed occurs in the qname
                                                            -> 'qname_fuzzy', 0.6
      4. metric mentions a known alias; label contains the canonical term
                                                            -> 'alias', 0.7

    Args:
        fact: Mapping with a 'metric_name' entry.
        concepts_df: DataFrame with 'label' and 'qname' columns.
        debug: When True, print candidate counts for strategies 2-4.

    Returns: (matched_concept_row, match_type, confidence)
        (None, 'no_match', 0.0) when no strategy finds a candidate.
    """
    metric = normalize_metric_for_matching(fact['metric_name'])
    if not metric:
        # An empty needle is a substring of every label, so the "contains"
        # strategies would link a blank metric to the first concept.
        return None, 'no_match', 0.0

    labels_lower = concepts_df['label'].str.lower()
    labels_clean = labels_lower.str.strip()

    # Strategy 1: Exact label match
    exact_matches = concepts_df[labels_clean == metric]
    if not exact_matches.empty:
        return exact_matches.iloc[0], 'exact_label', 1.0

    # Strategy 2: Label contains metric or vice versa
    label_has_metric = labels_lower.str.contains(metric, regex=False, na=False).astype(bool)
    # The reverse direction: the (non-empty) label occurs inside the metric.
    # Missing labels are not strings and therefore never match.
    metric_has_label = labels_clean.map(
        lambda label: isinstance(label, str) and label != '' and label in metric
    ).astype(bool)
    contains_matches = concepts_df[label_has_metric | metric_has_label]
    if not contains_matches.empty:
        if debug:
            print(f"  Found {len(contains_matches)} contains matches for '{metric}'")
        return contains_matches.iloc[0], 'label_contains', 0.8

    # Strategy 3: qname local part fuzzy match
    metric_clean = metric.replace(' ', '').replace('_', '')

    # Skip when nothing is left after stripping separators; an empty needle
    # would match every qname.
    if metric_clean:
        qnames_clean = (
            concepts_df['qname']
            .str.lower()
            .str.replace(' ', '', regex=False)
            .str.replace('_', '', regex=False)
        )
        qname_matches = concepts_df[
            qnames_clean.str.contains(metric_clean, regex=False, na=False)
        ]
        if not qname_matches.empty:
            if debug:
                print(f"  Found {len(qname_matches)} qname matches for '{metric_clean}'")
            return qname_matches.iloc[0], 'qname_fuzzy', 0.6

    # Strategy 4: Common aliases
    aliases = {
        'revenue': ['revenues', 'net sales', 'revenue from contract'],
        'net income': ['net earnings', 'profit', 'income after tax'],
        'diluted eps': ['diluted earnings per share', 'earnings per share diluted'],
        'operating income': ['income from operations', 'operating profit'],
    }

    for canonical, synonyms in aliases.items():
        if canonical in metric or any(syn in metric for syn in synonyms):
            # Only the canonical term is searched for in the labels.
            alias_matches = concepts_df[
                labels_lower.str.contains(canonical, regex=False, na=False)
            ]
            if not alias_matches.empty:
                if debug:
                    print(f"  Found alias match via '{canonical}'")
                return alias_matches.iloc[0], 'alias', 0.7

    return None, 'no_match', 0.0

# Match all extracted facts
matched_facts = []

for fact in extracted_facts:
    match, match_type, confidence = match_fact_to_concept(fact, concepts_df, debug=False)
    is_linked = match is not None

    # Keep every extracted field and append the linkage result next to it.
    matched_facts.append({
        **fact,
        'matched': is_linked,
        'match_type': match_type,
        'match_confidence': confidence,
        'concept_qname': match['qname'] if is_linked else None,
        'concept_label': match['label'] if is_linked else None,
        'concept_period_type': match['period_type'] if is_linked else None,
    })

if matched_facts:
    matched_df = pd.DataFrame(matched_facts)
else:
    # With no facts the frame would have no columns at all and the summary
    # below would fail with KeyError; declare the linkage columns explicitly.
    matched_df = pd.DataFrame(columns=[
        'matched',
        'match_type',
        'match_confidence',
        'concept_qname',
        'concept_label',
        'concept_period_type',
    ])

# Cast to bool so the mask is a valid boolean indexer even for an empty frame.
linked_mask = matched_df['matched'].astype(bool)
linked_df = matched_df[linked_mask]

# Summary statistics
total_facts = len(matched_df)
linked_facts = int(linked_mask.sum())
link_rate = (linked_facts / total_facts * 100) if total_facts > 0 else 0

print(f"{'='*80}")
print(f"MATCHING RESULTS")
print(f"{'='*80}")
print(f"Total facts extracted: {total_facts}")
print(f"Successfully linked: {linked_facts} ({link_rate:.1f}%)")
print(f"Failed to link: {total_facts - linked_facts} ({100-link_rate:.1f}%)")
print(f"\nMatch types:")
print(linked_df['match_type'].value_counts())
print(f"\n{'='*80}\n")

# Show matched facts
print("Sample of matched facts:\n")
for idx, row in linked_df.head(10).iterrows():
    print(f"✓ '{row['metric_name']}' → {row['concept_qname']}")
    print(f"  Label: {row['concept_label']}")
    print(f"  Match: {row['match_type']} (confidence: {row['match_confidence']})")
    print()

## STEP 3: Match Extracted Facts to XBRL Concepts

For each extracted fact, try to match its `metric_name` to a Concept from Apple's 10-K/10-Q filings.

In [None]:
def extract_facts_from_8k(text):
    """
    Extract financial facts from 8-K text using regex patterns.

    Three case-insensitive patterns are applied one after another, so the
    result is grouped by pattern rather than sorted by position, and the same
    passage may be reported by more than one pattern.

    Args:
        text: Section text to scan. None or an empty string yields no facts.

    Returns list of dicts with: metric_name, value_text, value_full,
    context_snippet, span_start, span_end, pattern_used.
    value_text has trailing '.'/',' removed; value_full, span_start and
    span_end describe the raw regex match.
    """
    if not text:
        return []

    # Pattern: Look for common financial statement terms followed by values
    # Example: "quarterly revenue of $119.6 billion" or "Net income was $36.3 billion"
    patterns = [
        # Pattern 1: "METRIC of $VALUE" or "METRIC was $VALUE"
        r'(?P<metric>(?:total |net |operating |gross )?(?:revenue|income|earnings|sales|profit|loss|margin|eps|assets|liabilities|equity|cash|debt)(?:\s+\(loss\))?)\s+(?:of|was|were|totaled?|at)\s+\$?(?P<value>[\d,\.]+\s*(?:billion|million|thousand)?)',

        # Pattern 2: "$VALUE in METRIC"
        r'\$(?P<value>[\d,\.]+\s*(?:billion|million|thousand)?)\s+(?:in|of|for)\s+(?P<metric>(?:total |net |operating |gross )?(?:revenue|income|earnings|sales|profit|loss))',

        # Pattern 3: Diluted EPS patterns
        r'(?P<metric>diluted earnings per share|diluted eps|earnings per diluted share)\s+(?:of|was|were)?\s*\$?(?P<value>[\d,\.]+)',
    ]

    facts = []
    text_length = len(text)

    for pattern in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            # The value character class also accepts '.' and ',', so a number
            # at the end of a clause drags the punctuation along ("2.18.").
            value_text = match.group('value').strip().rstrip('.,')
            if not any(ch.isdigit() for ch in value_text):
                # Pure punctuation such as "..." is not a value.
                continue

            # Get surrounding context (50 chars before and after)
            context_start = max(0, match.start() - 50)
            context_end = min(text_length, match.end() + 50)

            facts.append({
                'metric_name': match.group('metric').strip(),
                'value_text': value_text,
                'value_full': match.group(0),
                'context_snippet': text[context_start:context_end],
                'span_start': match.start(),
                'span_end': match.end(),
                'pattern_used': pattern[:50]  # Track which pattern matched
            })

    return facts

# Extract facts from most recent 8-K
# Bind the name up front so later cells can iterate `extracted_facts` even
# when the query returned no 8-K reports.
extracted_facts = []

if not reports_8k.empty:
    content = reports_8k.iloc[0]['content']
    # A missing section body is not a string; scan an empty text instead of
    # handing a non-string to the regex engine.
    extracted_facts = extract_facts_from_8k(content if isinstance(content, str) else '')

    print(f"Extracted {len(extracted_facts)} facts from 8-K\n")

    # Show first 10 facts
    for i, fact in enumerate(extracted_facts[:10], 1):
        print(f"{i}. Metric: {fact['metric_name']}")
        print(f"   Value: {fact['value_text']}")
        print(f"   Full: {fact['value_full']}")
        print(f"   Context: ...{fact['context_snippet']}...")
        print()

## STEP 2: Extract Financial Facts from 8-K Text

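Using simple regex patterns to find:
- Dollar amounts (e.g., "$119.6 billion", "$94.9 million")
- Percentages (e.g., "6%", "45.2%")
- Share counts
- Dates/periods

Note: the patterns in this step's code cell currently capture only named metrics followed by a numeric value (with an optional billion/million/thousand scale) and diluted EPS; percentages, share counts, and dates/periods are not extracted yet.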

In [None]:
# Get Apple's 8-K "Results of Operations" reports
# Returns up to five report sections, ordered by filing date descending.
# size() is used for the character count: length() on a string is rejected
# by Neo4j 4.0 and later, where it only accepts paths.
query_8k = """
MATCH (c:Company {ticker: $ticker})-[:FILED]->(r:Report {formType: '8-K'})
-[:HAS_SECTION]->(s:ExtractedSectionContent)
WHERE s.section_name CONTAINS 'ResultsofOperations'
WITH r, s
ORDER BY r.filingDate DESC
RETURN r.filing_id as filing_id,
       r.filingDate as filing_date,
       r.periodOfReport as period_of_report,
       s.id as section_id,
       s.section_name as section_name,
       s.content as content,
       size(s.content) as content_length
LIMIT 5
"""

with driver.session() as session:
    result = session.run(query_8k, ticker='AAPL')
    # Materialize the records while the session is still open.
    reports_8k = pd.DataFrame([dict(record) for record in result])

print(f"Found {len(reports_8k)} 8-K Results of Operations reports")
if not reports_8k.empty:
    latest_report = reports_8k.iloc[0]
    # A section stored without content comes back as null; preview an empty
    # string rather than failing on the slice.
    latest_content = latest_report['content']
    preview_text = latest_content if isinstance(latest_content, str) else ''

    print(f"\nMost recent report:")
    print(f"  Filing Date: {latest_report['filing_date']}")
    print(f"  Period: {latest_report['period_of_report']}")
    print(f"  Content Length: {latest_report['content_length']} chars")
    print(f"\nFirst 1000 chars of content:")
    print("="*80)
    print(preview_text[:1000])
    print("="*80)

In [None]:
# Reconnect to Neo4j if driver was closed
import pandas as pd
from neo4j import GraphDatabase
import os
import re
from datetime import datetime

uri = "bolt://localhost:30687"
username = "neo4j"
password = os.getenv('NEO4J_PASSWORD', 'your_password')

# Create new driver if needed
try:
    driver.verify_connectivity()
    print("✓ Driver already connected")
except Exception:
    # Covers both "driver was never defined" (NameError) and a driver that no
    # longer connects, while letting KeyboardInterrupt/SystemExit propagate.
    driver = GraphDatabase.driver(uri, auth=(username, password))
    print("✓ New driver created")

# Reload concepts if needed
if 'concepts_df' not in globals() or concepts_df.empty:
    concepts_df = get_company_concepts('AAPL', ['10-K', '10-Q'])
    print(f"✓ Reloaded {len(concepts_df)} concepts")
else:
    print(f"✓ Using existing {len(concepts_df)} concepts")

## STEP 1: Reconnect to Neo4j (if needed) and Get Sample 8-K