In [None]:
import langextract as lx
import pymupdf4llm
import json
from pathlib import Path
import textwrap
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Mirror GOOGLE_API_KEY into LANGEXTRACT_API_KEY for LangExtract.
# os.environ only accepts strings, so assigning a missing key (None) would raise
# an opaque TypeError; check first and fail with an actionable message instead.
_google_api_key = os.getenv("GOOGLE_API_KEY")
if _google_api_key:
    os.environ["LANGEXTRACT_API_KEY"] = _google_api_key
elif not os.getenv("LANGEXTRACT_API_KEY"):
    raise EnvironmentError(
        "No API key found: set GOOGLE_API_KEY (or LANGEXTRACT_API_KEY) "
        "in your environment or .env file before running this notebook."
    )

### Analysis of TJA1051 Datasheet

Key observations from the datasheet:
1. **Table 1 (Page 2)**: Quick reference data with Min/Typ/Max values
2. **Table 3 (Pages 8–9)**: Detailed static characteristics with conditions
3. **Multiple part numbers**: TJA1051T, TJA1051T/3, TJA1051T/E, TJA1051TK/3
4. **Values often have conditions**: e.g., "Normal mode; bus recessive"
5. **Some parameters appear in multiple places with different contexts**

## Understanding LangExtract Fields

### The Three Key Fields Explained:

#### 1. **extraction_class** (Category/Type)
- **Purpose**: Defines WHAT TYPE of thing you're extracting
- **Think of it as**: The label or category for the extracted information
- **Examples**: 
  - "supply_voltage", "temperature_range", "package_type"
  - "person", "location", "date"
  - "medication", "dosage", "side_effect"

#### 2. **extraction_text** (Exact Text)
- **Purpose**: The EXACT text from the source that contains the information
- **Critical**: Must be verbatim from the source - no paraphrasing!
- **Examples**:
  - "4.5 - 5.5 V" (exactly as it appears)
  - "−40°C to +125°C" (including special characters)
  - "SO8" (precise package name)

#### 3. **attributes** (Structured Details)
- **Purpose**: The PARSED/STRUCTURED information from the extracted text
- **Think of it as**: Breaking down the extracted text into meaningful parts
- **Examples**:
  - `{"min": "4.5", "max": "5.5", "unit": "V"}`
  - `{"parameter": "VCC", "condition": "Silent mode"}`
  - `{"part_number": "TJA1051T", "package": "SO8"}`

### Visual Example:
```
Source text: "VCC supply voltage 4.5 - 5.5 V"
                    ↓
extraction_class: "supply_voltage"  ← What type of info?
extraction_text: "4.5 - 5.5 V"      ← Exact text extracted
attributes: {                       ← Structured breakdown
    "parameter": "VCC",
    "min": "4.5",
    "max": "5.5", 
    "unit": "V"
}
```

### Why This Structure?

1. **Source Grounding**: `extraction_text` ensures every extraction can be traced back to the exact source
2. **Consistency**: `extraction_class` helps the model understand patterns across similar extractions
3. **Flexibility**: `attributes` allows rich, structured data without losing the source connection
4. **Schema Control**: The model learns from your examples to maintain consistent output structure

### Common Patterns for Datasheets:

| extraction_class | extraction_text | attributes |
|-----------------|-----------------|------------|
| supply_voltage | "4.5 - 5.5 V" | min, max, unit |
| supply_current | "0.1 1 2.5 mA" | min, typ, max, unit, condition |
| temperature_range | "-40°C to +125°C" | min, max, unit |
| package_type | "SO8" | package, description |
| data_rate | "up to 5 Mbit/s" | max, unit, protocol |

## Testing extraction_text Flexibility

### Key Question: How exact does extraction_text need to be?

The `extraction_text` field must **match the source exactly**: it is the verbatim text being extracted, not a pattern or a paraphrase. However, the model learns from your examples to recognize **different formats** that represent the same type of information.

### Step 1: Convert PDF to Markdown

In [None]:
# Example input: the TJA1051 CAN transceiver datasheet
pdf_path = "/Users/qingye/Documents/analytics-env/TJA1051.pdf"

# Turn the PDF into a single markdown string
md_text = pymupdf4llm.to_markdown(pdf_path)

# Report the size of the conversion, then show the leading 2000 characters
total_chars = len(md_text)
print(f"PDF converted to {total_chars} characters")
print("\nFirst 2000 characters of markdown:")
preview = md_text[:2000]
print(preview)

### Step 2: Define Parameters to Extract

For technical datasheets, we typically want to extract:
- Operating voltage ranges
- Temperature ranges
- Current consumption
- Package types
- Key features
- Pin configurations

In [None]:
# Few-shot examples for common datasheet parameters.
# Each ExampleData pairs a short source text with the extractions expected from it;
# every extraction_text is a verbatim substring of its example text.
# There is one example per class named in prompt_description below, including
# protection_feature, so each requested class has a concrete output pattern.
examples = [
    lx.data.ExampleData(
        text="The device operates from 4.5V to 5.5V supply voltage",
        extractions=[
            lx.data.Extraction(
                extraction_class="operating_voltage",
                extraction_text="4.5V to 5.5V",
                attributes={"min": "4.5V", "max": "5.5V"},
            )
        ],
    ),
    lx.data.ExampleData(
        text="Operating temperature range: -40°C to +125°C",
        extractions=[
            lx.data.Extraction(
                extraction_class="temperature_range",
                extraction_text="-40°C to +125°C",
                attributes={"min": "-40°C", "max": "+125°C"},
            )
        ],
    ),
    lx.data.ExampleData(
        text="Supply current in normal mode is typically 10mA, maximum 15mA",
        extractions=[
            lx.data.Extraction(
                extraction_class="supply_current",
                extraction_text="typically 10mA, maximum 15mA",
                attributes={"typical": "10mA", "max": "15mA", "mode": "normal"},
            )
        ],
    ),
    lx.data.ExampleData(
        text="Available in SO8 and DIP8 packages",
        extractions=[
            # One extraction per package mentioned in the sentence.
            lx.data.Extraction(
                extraction_class="package_type",
                extraction_text="SO8",
                attributes={"type": "surface_mount"},
            ),
            lx.data.Extraction(
                extraction_class="package_type",
                extraction_text="DIP8",
                attributes={"type": "through_hole"},
            ),
        ],
    ),
    lx.data.ExampleData(
        text="High-speed CAN transceiver supports data rates up to 1 Mbit/s",
        extractions=[
            lx.data.Extraction(
                extraction_class="data_rate",
                extraction_text="up to 1 Mbit/s",
                attributes={"max": "1 Mbit/s", "protocol": "CAN"},
            )
        ],
    ),
    # Covers class 6 of the prompt, which otherwise has no example to learn from.
    lx.data.ExampleData(
        text="The bus pins are protected against ESD up to 8 kV",
        extractions=[
            lx.data.Extraction(
                extraction_class="protection_feature",
                extraction_text="ESD up to 8 kV",
                attributes={"type": "ESD", "rating": "8 kV"},
            )
        ],
    ),
]

# Natural-language task description sent alongside the examples.
# The class names listed here match the extraction_class values used above.
prompt_description = """
Extract technical specifications from electronic component datasheets:
1. operating_voltage: Supply voltage specifications (min/max/typical values)
2. temperature_range: Operating temperature specifications
3. supply_current: Current consumption in different operating modes
4. package_type: Available IC package options
5. data_rate: Communication speed or baud rate specifications
6. protection_feature: ESD, EMC, or other protection specifications
"""

### Step 3: Extract Parameters from Datasheet

In [None]:
# Run LangExtract on the converted datasheet and bind the outcome to `result`.
result = lx.extract(
    text_or_documents=md_text,  # the full markdown string is passed; no truncation is applied here
    prompt_description=prompt_description,
    examples=examples,
    model_id="gemini-2.5-flash",
    temperature=0.0,
    extraction_passes=1,  # a single pass; NOTE(review): raise above 1 if multi-pass recall is wanted
    batch_length=10,
    max_workers=20,
    max_char_buffer=1000,  # set to 1000; presumably the per-chunk character budget — confirm in LangExtract docs
    debug=True,
)

In [None]:
# Display the extractions held by `result` as the cell output.
result.extractions

In [None]:
# Persist the annotated document as JSONL in the current directory.
lx.io.save_annotated_documents(
    [result], output_name="extraction_results.jsonl", output_dir="."
)

# Build the HTML visualization from the JSONL file just written.
html_content = lx.visualize("extraction_results.jsonl")

# Write the HTML with an explicit UTF-8 encoding: datasheet text contains
# non-ASCII characters (e.g. "°", "−", "±") that can fail to encode, or be
# mis-encoded, when the platform's default locale encoding is not UTF-8.
with open("visualization.html", "w", encoding="utf-8") as f:
    if hasattr(html_content, "data"):
        f.write(html_content.data)  # display object (Jupyter/Colab): HTML is in .data
    else:
        f.write(html_content)  # otherwise the value is written directly

In [None]:
# NOTE(review): this cell repeats the save/visualize cell directly above it;
# running either one is sufficient.

# Persist the annotated document as JSONL in the current directory.
lx.io.save_annotated_documents(
    [result], output_name="extraction_results.jsonl", output_dir="."
)

# Build the HTML visualization from the JSONL file just written.
html_content = lx.visualize("extraction_results.jsonl")

# Write the HTML with an explicit UTF-8 encoding: datasheet text contains
# non-ASCII characters (e.g. "°", "−", "±") that can fail to encode, or be
# mis-encoded, when the platform's default locale encoding is not UTF-8.
with open("visualization.html", "w", encoding="utf-8") as f:
    if hasattr(html_content, "data"):
        f.write(html_content.data)  # display object (Jupyter/Colab): HTML is in .data
    else:
        f.write(html_content)  # otherwise the value is written directly

### Step 4: Advanced - Batch Processing Multiple Datasheets

In [None]:
def extract_datasheet_params(pdf_path, max_chars=10000):
    """
    Prepare a single datasheet for extraction.

    Converts the PDF at ``pdf_path`` to markdown and wraps the first
    ``max_chars`` characters in a ``lx.data.Document``. No extraction is run
    here; the returned document is meant to be passed to ``lx.extract``.

    Args:
        pdf_path: Path to the datasheet PDF.
        max_chars: Maximum number of markdown characters kept in the document.

    Returns:
        The prepared ``lx.data.Document``, or ``None`` if conversion or
        document construction failed (the error is printed, not raised).
    """
    try:
        # Convert PDF to markdown
        md_text = pymupdf4llm.to_markdown(pdf_path)

        # The Document constructor takes `document_id` / `additional_context`
        # rather than a `metadata` keyword (see the LangExtract data module).
        # Passing `metadata=` raised a TypeError that the except below swallowed,
        # so every datasheet was dropped. The file name is carried as document_id.
        return lx.data.Document(
            text=md_text[:max_chars],
            document_id=Path(pdf_path).name,
        )
    except Exception as e:
        # Best-effort by design: report the failure and let the caller skip this file.
        print(f"Error processing {pdf_path}: {e}")
        return None


# Datasheets to include in the batch run
datasheet_paths = [
    "/Users/qingye/Documents/analytics-env/TJA1051.pdf",
    "/Users/qingye/Documents/analytics-env/tcan1044a-q1.pdf",
    # Add more datasheets as needed
]

# Build one Document per datasheet that exists on disk and converts successfully
documents = []
for path in datasheet_paths:
    # Paths that are not present are skipped
    if not Path(path).exists():
        continue
    doc = extract_datasheet_params(path)
    # A falsy return (None on failure) is left out of the batch
    if doc:
        documents.append(doc)

print(f"Prepared {len(documents)} datasheets for extraction")

In [None]:
# Batch extract from multiple datasheets (skipped when no documents were prepared)
if documents:
    try:
        results = lx.extract(
            text_or_documents=documents,
            prompt_description=prompt_description,
            examples=examples,
            model_id="gemini-2.0-flash-exp",
            temperature=0.0,
            batch_length=2,
            max_workers=2,
            debug=False,
        )

        # Print the extractions for each datasheet.
        # The loop variable is deliberately not named `result`, so the
        # single-document `result` used by other cells is not overwritten.
        for doc_result in results:
            # Prefer a metadata "filename" when the object carries one; otherwise
            # fall back to its document_id, then to "Unknown".
            doc_metadata = getattr(doc_result, "metadata", None) or {}
            doc_label = (
                doc_metadata.get("filename")
                or getattr(doc_result, "document_id", None)
                or "Unknown"
            )

            print(f"\n{'='*60}")
            print(f"Datasheet: {doc_label}")
            print(f"{'='*60}")

            # Treat a missing extraction list as empty instead of failing the loop.
            for extraction in doc_result.extractions or []:
                print(f"\nClass: {extraction.extraction_class}")
                print(f"  Text: {extraction.extraction_text}")
                print(f"  Attributes: {extraction.attributes}")

    except Exception as e:
        # Best-effort by design: report the failure without stopping the notebook.
        print(f"Error during batch extraction: {e}")

### Step 5: Save Extracted Data

In [None]:
def save_extraction_results(result, output_file="extracted_params.json"):
    """
    Write the extractions of ``result`` to ``output_file`` as indented JSON.

    The saved structure has two keys: "metadata" (``result.metadata`` when the
    attribute exists, otherwise an empty dict) and "parameters" (one dict with
    "class", "text" and "attributes" per extraction). The same structure is
    returned to the caller.
    """
    metadata = getattr(result, "metadata", {})

    # One flat record per extraction, in the order they appear on the result
    parameters = [
        {
            "class": item.extraction_class,
            "text": item.extraction_text,
            "attributes": item.attributes,
        }
        for item in result.extractions
    ]

    extracted_data = {"metadata": metadata, "parameters": parameters}

    with open(output_file, "w") as handle:
        json.dump(extracted_data, handle, indent=2)

    print(f"Results saved to {output_file}")
    return extracted_data


# Example: Save results from single extraction
# if 'result' in locals():
#     saved_data = save_extraction_results(result)
#     print(json.dumps(saved_data, indent=2))

### Step 6: Result Consolidation

Since the same parameter can appear in multiple places, we need to consolidate results:

In [None]:
def consolidate_extractions(result):
    """
    Consolidate multiple extractions of the same parameter.

    Extractions are grouped by ``(extraction_class, name)``, where ``name`` is
    the "parameter" attribute when present and the extraction text otherwise.

    Args:
        result: Object exposing ``extractions``; each extraction provides
            ``extraction_class``, ``extraction_text`` and ``attributes``.
            ``extractions`` and ``attributes`` may be ``None``.

    Returns:
        A nested dict ``{class_name: {name: value}}``. ``value`` is the
        attributes dict when the group has one extraction, otherwise
        ``{"multiple_conditions": True, "values": [...]}`` where each entry
        holds the "text" and "attributes" of one extraction.
    """
    from collections import defaultdict

    consolidated = defaultdict(list)

    # A result without extractions yields an empty output instead of a TypeError.
    for extraction in result.extractions or []:
        # Attributes are optional on an extraction; treat None as "no attributes"
        # so the membership test below cannot fail.
        attributes = extraction.attributes
        if attributes is None:
            attributes = {}

        # Create a key based on extraction class and parameter name
        if "parameter" in attributes:
            key = (extraction.extraction_class, attributes["parameter"])
        else:
            key = (extraction.extraction_class, extraction.extraction_text)

        consolidated[key].append(
            {"text": extraction.extraction_text, "attributes": attributes}
        )

    # Format consolidated results
    output = {}
    for (class_name, param_name), values in consolidated.items():
        class_bucket = output.setdefault(class_name, {})

        # If multiple values for same parameter, keep all with their conditions
        if len(values) > 1:
            class_bucket[param_name] = {
                "multiple_conditions": True,
                "values": values,
            }
        else:
            class_bucket[param_name] = values[0]["attributes"]

    return output


# Example usage:
# consolidated_params = consolidate_extractions(result)
# print(json.dumps(consolidated_params, indent=2))

In [None]:
# 1. Extraction rules for the model: three sentences, one per line
prompt = "\n".join(
    [
        "Extract characters, emotions, and relationships in order of appearance.",
        "Use exact text for extractions. Do not paraphrase or overlap entities.",
        "Provide meaningful attributes for each entity to add context.",
    ]
)

# 2. A single worked example showing the expected classes, spans and attributes.
# NOTE: this rebinds `examples`, replacing the datasheet examples defined earlier.
examples = [
    lx.data.ExampleData(
        text="ROMEO. But soft! What light through yonder window breaks? It is the east, and Juliet is the sun.",
        extractions=[
            lx.data.Extraction(
                extraction_class=label,
                extraction_text=span,
                attributes=details,
            )
            # (class, verbatim span, attributes), listed in order of appearance
            for label, span, details in (
                ("character", "ROMEO", {"emotional_state": "wonder"}),
                ("emotion", "But soft!", {"feeling": "gentle awe"}),
                ("relationship", "Juliet is the sun", {"type": "metaphor"}),
            )
        ],
    )
]

In [None]:
# Print the built-in help text for lx.extract (reference lookup; its output is informational only).
help(lx.extract)

In [None]:
# Sentence to analyse with the prompt and example from the cells above
input_text = "Lady Juliet gazed longingly at the stars, her heart aching for Romeo"

# Call LangExtract and bind the outcome to `result`
result = lx.extract(
    model_id="gemini-2.5-flash-lite",
    batch_length=10,
    examples=examples,
    prompt_description=prompt,
    text_or_documents=input_text,
)

In [None]:
# Display the extractions held by `result` as the cell output.
result.extractions