In [1]:
import langextract as lx
import textwrap
import os
from dotenv import load_dotenv

# Load environment variables (python-dotenv) so the os.getenv() lookups for
# the API keys further down can find them. As the last expression of the
# cell, the call's return value is echoed as the cell output.
load_dotenv()

True

In [None]:
# Set LANGEXTRACT_API_KEY if not already set, falling back to GEMINI_API_KEY.
# os.environ only accepts str values, so assigning os.getenv(...) directly
# raises TypeError when GEMINI_API_KEY is missing too; guard that case and
# tell the user what to configure instead of failing with a cryptic error.
if not os.getenv('LANGEXTRACT_API_KEY'):
    if os.getenv('GEMINI_API_KEY'):
        os.environ['LANGEXTRACT_API_KEY'] = os.environ['GEMINI_API_KEY']
    else:
        print(
            "⚠️ No API key found: set LANGEXTRACT_API_KEY or GEMINI_API_KEY "
            "(for example in your .env file) before running the extraction cells."
        )

# 1. Instructions sent to the model for financial data extraction
#    (XBRL-like concepts). Built line by line; the empty entry produces the
#    blank line that separates the focus list from the extraction rules.
prompt = "\n".join([
    "Extract financial entities, metrics, and relationships from text.",
    "Focus on:",
    "- XBRL concepts: company names, tickers, financial items",
    "- Facts: numerical values, dates, percentages",
    "- Descriptions: analyst ratings, business segments, financial conditions",
    "",
    "Use exact text for extractions. Preserve numerical precision.",
    "Provide meaningful attributes like units, periods, and context.",
])

# 2. Few-shot examples that show the model which extraction classes to emit
#    and how to lay out their attributes. Each spec is
#    (source text, [(extraction class, exact span, attributes), ...]); the spans
#    are copied verbatim from the source text, in order of appearance.
# NOTE(review): in XBRL, `decimals` normally expresses reporting precision;
# the value 9 on the revenue fact looks more like a scale - confirm the intent.
_example_specs = [
    (
        "Apple Inc. (NASDAQ: AAPL) reported revenue of $89.5 billion for Q4 2023, representing a 2% year-over-year decline.",
        [
            ("company", "Apple Inc.", {"ticker": "AAPL", "exchange": "NASDAQ"}),
            ("xbrl_concept", "revenue", {"concept": "Revenue", "gaap_item": "us-gaap:Revenues"}),
            ("fact", "$89.5 billion", {"value": 89500000000, "unit": "USD", "decimals": 9}),
            ("period", "Q4 2023", {"fiscal_period": "Q4", "fiscal_year": 2023}),
            ("fact", "2% year-over-year decline", {"value": -0.02, "unit": "percent", "comparison": "YoY"}),
        ],
    ),
    (
        "The company's gross margin improved to 45.2% from 43.1% in the prior quarter, driven by product mix optimization.",
        [
            ("xbrl_concept", "gross margin", {"concept": "GrossMargin", "gaap_item": "us-gaap:GrossProfitMargin"}),
            ("fact", "45.2%", {"value": 0.452, "unit": "percent", "period": "current_quarter"}),
            ("fact", "43.1%", {"value": 0.431, "unit": "percent", "period": "prior_quarter"}),
            ("description", "driven by product mix optimization", {"type": "reason", "impact": "positive"}),
        ],
    ),
]

examples = [
    lx.data.ExampleData(
        text=source_text,
        extractions=[
            lx.data.Extraction(
                extraction_class=extraction_class,
                extraction_text=span,
                attributes=attributes,
            )
            for extraction_class, span, attributes in extraction_specs
        ],
    )
    for source_text, extraction_specs in _example_specs
]

In [3]:
# Input for the first extraction run: an analyst-ratings article about
# Marathon Petroleum (NYSE: MPC). The text is handed to lx.extract() exactly
# as written here, including the leading `body: "` prefix and the ratings
# table that has been flattened into a single line of numbers.
input_text = """body: "Marathon Petroleum (NYSE: MPC ) has observed the following analyst ratings within the last quarter: 
Bullish Somewhat Bullish Indifferent Somewhat Bearish Bearish Total Ratings 2 4 4 0 0 Last 30D 0 1 0 0 0 1M Ago 0 0 1 0 0 2M Ago 0 0 1 0 0 3M Ago 2 3 2 0 0 
According to 10 analyst offering 12-month price targets in the last 3 months, Marathon Petroleum has an average price target of $134.0 with a high of $153.00 
and a low of $116.00. Below is a summary of how these 10 analysts rated Marathon Petroleum over the past 3 months. 
The greater the number of bullish ratings, the more positive analysts are on the stock and the greater the number of bearish ratings, 
the more negative analysts are on the stock This average price target has increased by 6.72% over the past month. Stay up to date on Marathon Petroleum analyst ratings. 
What Are Analyst Ratings? Ratings come from analysts, or specialists within banking and financial systems that report for specific stocks or 
defined sectors (typically once per quarter for each stock). Analysts usually derive their information from company conference calls and meetings,
financial statements, and conversations with important insiders to reach their decisions. Some analysts will also offer forecasts for metrics like growth estimates,
earnings, and revenue to provide further guidance on stocks. Investors who use analyst ratings should note that this specialized advice comes from humans 
and may be subject to error. This article was generated by Benzinga's automated content engine and reviewed by an editor."""

# Run the first extraction. No api_key argument is passed, so the key has to
# be available through the environment at this point.
extraction_request = {
    "text_or_documents": input_text,
    "prompt_description": prompt,
    "examples": examples,
    "model_id": "gemini-2.5-flash",
}
result = lx.extract(**extraction_request)

# Only reached when lx.extract() returned without raising.
print("✅ Extraction completed successfully!")

[94m[1mLangExtract[0m: model=[92mgemini-2.5-flash[0m, current=[92m1,574[0m chars, processed=[92m1,574[0m chars:  [00:00]

[94m[1mLangExtract[0m: model=[92mgemini-2.5-flash[0m, current=[92m1,574[0m chars, processed=[92m1,574[0m chars:  [00:26]

[92m✓[0m Extraction processing complete
[92m✓[0m Extracted [1m48[0m entities ([1m4[0m unique types)
  [96m•[0m Time: [1m26.22s[0m
  [96m•[0m Speed: [1m60[0m chars/sec
  [96m•[0m Chunks: [1m2[0m
✅ Extraction completed successfully!





In [4]:
# Print the first extraction result as a plain-text report: a banner, one
# section per extraction class (in order of first occurrence), then a summary.
from collections import defaultdict

banner = "=" * 70
section_rule = "-" * 40

print(banner)
print("FINANCIAL DATA EXTRACTION RESULTS")
print(banner)
print()

# Bucket the extractions by their class name.
grouped = defaultdict(list)
for extraction in result.extractions:
    grouped[extraction.extraction_class].append(extraction)

# One section per class; every extraction shows its text and, when it has
# any, a bullet per attribute.
for class_name, extractions in grouped.items():
    print(f"📊 {class_name.upper()}")
    print(section_rule)
    for ext in extractions:
        print(f"  Text: '{ext.extraction_text}'")
        # attributes may be None or empty, in which case nothing is listed
        for key, value in (ext.attributes or {}).items():
            print(f"    • {key}: {value}")
    print()

# Summary statistics
print(banner)
print(f"SUMMARY: Extracted {len(result.extractions)} items across {len(grouped)} categories")
print(banner)

FINANCIAL DATA EXTRACTION RESULTS

📊 COMPANY
----------------------------------------
  Text: 'Marathon Petroleum'
    • ticker: MPC
    • exchange: NYSE

📊 XBRL_CONCEPT
----------------------------------------
  Text: 'Analyst Ratings'
  Text: 'Bullish Ratings Count'
  Text: 'Somewhat Bullish Ratings Count'
  Text: 'Indifferent Ratings Count'
  Text: 'Somewhat Bearish Ratings Count'
  Text: 'Bearish Ratings Count'
  Text: 'Total Analyst Ratings'
  Text: 'Somewhat Bullish Ratings Count'
  Text: 'Indifferent Ratings Count'
  Text: 'Indifferent Ratings Count'
  Text: 'Bullish Ratings Count'
  Text: 'Somewhat Bullish Ratings Count'
  Text: 'Indifferent Ratings Count'
  Text: 'Number of Analysts'
  Text: 'Average Price Target'
    • concept: AnalystAveragePriceTarget
  Text: 'High Price Target'
    • concept: AnalystHighPriceTarget
  Text: 'Low Price Target'
    • concept: AnalystLowPriceTarget
  Text: 'Average Price Target'
    • concept: AnalystAveragePriceTarget
  Text: 'Ratings'
    • 

In [5]:
# Example 2: Process a second financial document.
# NOTE(review): despite the variable name, the sample below is an
# officer-departure disclosure ("Item 5.02 ...") for Apellis Pharmaceuticals
# rather than an analyst report, and it is shorter than the first input.
# The literal is not a raw string, so the \n and \uXXXX escape sequences in
# it are decoded by Python before the text is sent to the model.
longer_text = """
extracted_sections: "{"DepartureofDirectorsorCertainOfficers;ElectionofDirectors;AppointmentofCertainOfficers:CompensatoryArrangementsofCertainOfficers": " Item 5.02 Departure of Directors or Certain Officers; Election of Directors; Appointment of Certain Officers; Compensatory Arrangements of Certain Officers. \n\nOn January 5, 2023, Apellis Pharmaceuticals, Inc., a Delaware corporation (the \u201cCompany\u201d), announced that Federico Grossi, M.D., Ph.D., the Chief Medical Officer of the Company, will be leaving the Company to pursue new career opportunities. In connection with the announcement of Dr. Grossi\u2019s departure, the Company also announced that Caroline Baumal, M.D. will join the Company as Chief Medical Officer, effective as of January 3, 2023. Dr. Grossi will cease to serve as Chief Medical Officer effective as of January 3, 2023 and will continue at the Company until February 28, 2023 to help with the transition and preparations for the potential launch of pegcetacoplan for geographic atrophy. \n\n \n\n"}"
"""

# Run the same prompt and examples against the second document.
second_request = dict(
    text_or_documents=longer_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
)
result2 = lx.extract(**second_request)

report_rule = "=" * 70
print(f"\n{report_rule}")
print("ANALYST REPORT EXTRACTION")
print(report_rule)

# Sort the extractions of interest into three lists in a single pass;
# extractions of any other class are ignored for this preview.
facts, concepts, companies = [], [], []
_buckets = {"fact": facts, "xbrl_concept": concepts, "company": companies}
for ext in result2.extractions:
    bucket = _buckets.get(ext.extraction_class)
    if bucket is not None:
        bucket.append(ext)


def _print_preview(items, attribute_key, attribute_label, limit=5):
    """Print the text of the first `limit` items, plus one attribute when it is present."""
    for item in items[:limit]:
        print(f"  • {item.extraction_text}")
        item_attributes = item.attributes
        if item_attributes and attribute_key in item_attributes:
            print(f"    {attribute_label}: {item_attributes[attribute_key]}")


print(f"\n📈 Found {len(companies)} companies, {len(concepts)} XBRL concepts, {len(facts)} numerical facts")

print("\nKey Facts Extracted:")
_print_preview(facts, 'value', "Value")

print("\nXBRL Concepts Identified:")
_print_preview(concepts, 'gaap_item', "GAAP")

[94m[1mLangExtract[0m: model=[92mgemini-2.5-flash[0m, current=[92m1,018[0m chars, processed=[92m1,018[0m chars:  [00:00]

[94m[1mLangExtract[0m: model=[92mgemini-2.5-flash[0m, current=[92m1,018[0m chars, processed=[92m1,018[0m chars:  [00:12]

[92m✓[0m Extraction processing complete
[92m✓[0m Extracted [1m10[0m entities ([1m4[0m unique types)
  [96m•[0m Time: [1m12.56s[0m
  [96m•[0m Speed: [1m81[0m chars/sec
  [96m•[0m Chunks: [1m2[0m

ANALYST REPORT EXTRACTION

📈 Found 1 companies, 0 XBRL concepts, 2 numerical facts

Key Facts Extracted:
  • January 5, 2023
    Value: 2023-01-05
  • January 3, 2023
    Value: 2023-01-03

XBRL Concepts Identified:





In [6]:
# Example 3: Extract XBRL-like data from an earnings-call style transcript.
# The snippet is a single quoted CEO statement that mixes reported figures
# (revenue, operating income, margins, shareholder returns) with
# forward-looking guidance for FY2025.
earnings_call_text = """
CEO John Smith: "Thank you for joining our Q4 2024 earnings call. I'm pleased to report that 
Microsoft Corporation achieved record revenue of $62.0 billion this quarter, up 18% year-over-year. 
Our cloud segment, Azure, grew 30% and now represents 40% of total revenue. 

Operating income increased to $27.0 billion with an operating margin of 43.5%, up from 42.1% last year.
We returned $9.7 billion to shareholders through dividends and share repurchases.

Looking ahead to FY2025, we expect revenue growth of 15-17% and plan to invest $50 billion in AI infrastructure."
"""

# Extract with focus on speaker attribution and forward-looking statements.
# The base prompt is extended for this call only; `prompt` itself is unchanged.
result3 = lx.extract(
    text_or_documents=earnings_call_text,
    prompt_description=prompt + "\nAlso identify speakers and forward-looking statements.",
    examples=examples,
    model_id="gemini-2.5-flash",
)

print("\n" + "=" * 70)
print("EARNINGS CALL EXTRACTION FOR NEO4J STORAGE")
print("=" * 70)

# Prepare data for Neo4j node creation (similar to XBRL structure)
print("\n🔷 Neo4j Node Structure Preview:")
print("-" * 40)

# Preview at most the first 8 extractions; a missing (None) list is treated as empty.
for i, ext in enumerate((result3.extractions or [])[:8], 1):
    node_data = {
        "id": f"extract_{i}",
        "class": ext.extraction_class,
        "text": ext.extraction_text
    }
    # Attribute names are produced by the model, so they can coincide with the
    # reserved fields above. Prefix such keys instead of letting them overwrite
    # the node's id/class/text.
    attributes = ext.attributes or {}
    for key, value in attributes.items():
        safe_key = key
        while safe_key in node_data:
            safe_key = f"attr_{safe_key}"
        node_data[safe_key] = value

    print(f"\nNode {i}: {ext.extraction_class.upper()}")
    print(f"  Properties: {node_data}")

    # Show potential relationships. The fact is linked only when it actually
    # carries a "concept" attribute (key membership, not a substring match on
    # the dict's text representation, which would also match attribute values).
    if ext.extraction_class == "fact" and "concept" in attributes:
        print("  → Relationship: HAS_FACT (links to XBRL concept)")
    elif ext.extraction_class == "company":
        print("  → Relationship: REPORTS (links to facts)")

print("\n" + "=" * 70)
print("Ready for Neo4j ingestion via EventMarketDB pipeline")
print("=" * 70)

[94m[1mLangExtract[0m: model=[92mgemini-2.5-flash[0m, current=[92m573[0m chars, processed=[92m573[0m chars:  [00:12]

[92m✓[0m Extraction processing complete
[92m✓[0m Extracted [1m23[0m entities ([1m5[0m unique types)
  [96m•[0m Time: [1m12.07s[0m
  [96m•[0m Speed: [1m48[0m chars/sec
  [96m•[0m Chunks: [1m1[0m

EARNINGS CALL EXTRACTION FOR NEO4J STORAGE

🔷 Neo4j Node Structure Preview:
----------------------------------------

Node 1: COMPANY
  Properties: {'id': 'extract_1', 'class': 'company', 'text': 'Microsoft Corporation'}
  → Relationship: REPORTS (links to facts)

Node 2: XBRL_CONCEPT
  Properties: {'id': 'extract_2', 'class': 'xbrl_concept', 'text': 'revenue', 'concept': 'Revenue', 'gaap_item': 'us-gaap:Revenues'}

Node 3: FACT
  Properties: {'id': 'extract_3', 'class': 'fact', 'text': '$62.0 billion', 'value': '62000000000', 'unit': 'USD', 'decimals': '9'}

Node 4: PERIOD
  Properties: {'id': 'extract_4', 'class': 'period', 'text': 'this quarter', 'fiscal_period': 'Q4', 'fiscal_year': '2024'}

Node 5: FACT
  Properties: {'id': 'extract_5', 'class': 'fact', 'text': '18% yea




In [7]:
# Save extractions in a format ready for EventMarketDB/Neo4j ingestion
import json
from datetime import datetime

def prepare_for_neo4j(extractions, source_doc="analyst_report", company_ticker="MPC"):
    """
    Convert langextract results to EventMarketDB Neo4j format.

    Similar to XBRL node structure: (Report)-[:HAS_XBRL]->(XBRLNode)-[:HAS_FACT]->(Fact)

    Args:
        extractions: Iterable of objects exposing ``extraction_class``,
            ``extraction_text`` and ``attributes`` (a dict or None).
            ``None`` is treated as "nothing extracted".
        source_doc: Source label stored on the document and used in node ids.
        company_ticker: Ticker stored on the document and used in node ids.

    Returns:
        dict with three keys:
          - "document": type, source, extraction timestamp (local time, ISO
            format) and company ticker.
          - "nodes": one entry per extraction with "node_id"
            (``{company_ticker}_{source_doc}_{index}``), "labels" (the
            upper-cased extraction class) and "properties".
          - "relationships": one edge from "document" to every "fact"
            (HAS_FACT) and "xbrl_concept" (DEFINES_CONCEPT) node.

        Node properties always contain "text" and "extraction_class". Extraction
        attributes are merged in; an attribute whose name collides with an
        existing property is stored under an ``attr_``-prefixed key so it can
        never overwrite the core fields.
    """
    # Extraction classes that get an edge from the document, and the edge type.
    relationship_types = {
        "fact": "HAS_FACT",
        "xbrl_concept": "DEFINES_CONCEPT",
    }

    neo4j_data = {
        "document": {
            "type": "langextract_document",
            "source": source_doc,
            "extracted_at": datetime.now().isoformat(),
            "company_ticker": company_ticker
        },
        "nodes": [],
        "relationships": []
    }

    # Treat a missing (None) extraction list as empty instead of raising.
    for i, ext in enumerate(extractions or []):
        node_id = f"{company_ticker}_{source_doc}_{i}"

        # Core properties first, so they keep their position and their values.
        node_properties = {
            "text": ext.extraction_text,
            "extraction_class": ext.extraction_class
        }
        # Attribute names come from the model; rename on collision rather than
        # clobbering "text" / "extraction_class" (or an earlier renamed key).
        for key, value in (ext.attributes or {}).items():
            safe_key = key
            while safe_key in node_properties:
                safe_key = f"attr_{safe_key}"
            node_properties[safe_key] = value

        neo4j_data["nodes"].append({
            "node_id": node_id,
            "labels": [ext.extraction_class.upper()],
            "properties": node_properties
        })

        # Create relationships based on extraction class
        relationship_type = relationship_types.get(ext.extraction_class)
        if relationship_type is not None:
            neo4j_data["relationships"].append({
                "type": relationship_type,
                "from": "document",
                "to": node_id
            })

    return neo4j_data

# Build the ingestion payload from the first (analyst ratings) extraction run.
neo4j_ready = prepare_for_neo4j(result.extractions, "analyst_ratings", "MPC")
document_info = neo4j_ready['document']
node_list = neo4j_ready['nodes']

# Short summary of what would be created, plus the first node as a sample.
print("📁 Data prepared for Neo4j ingestion:")
print("=" * 50)
print(f"Document: {document_info['source']}")
print(f"Company: {document_info['company_ticker']}")
print(f"Nodes to create: {len(node_list)}")
print(f"Relationships to create: {len(neo4j_ready['relationships'])}")
print("\nSample Node:")
if node_list:
    print(json.dumps(node_list[0], indent=2))

# Write the payload to a timestamped JSON file under /tmp for batch processing.
file_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_file = f"/tmp/langextract_{document_info['company_ticker']}_{file_stamp}.json"
with open(output_file, 'w') as f:
    f.write(json.dumps(neo4j_ready, indent=2))
print(f"\n✅ Saved to: {output_file}")

📁 Data prepared for Neo4j ingestion:
Document: analyst_ratings
Company: MPC
Nodes to create: 48
Relationships to create: 39

Sample Node:
{
  "node_id": "MPC_analyst_ratings_0",
  "labels": [
    "COMPANY"
  ],
  "properties": {
    "text": "Marathon Petroleum",
    "extraction_class": "company",
    "ticker": "MPC",
    "exchange": "NYSE"
  }
}

✅ Saved to: /tmp/langextract_MPC_20250804_144909.json
