### LangExtract Demo
https://github.com/google/langextract

In [26]:
import os
from dotenv import load_dotenv
import pymupdf4llm

# Load variables from a local .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
# NOTE(review): presumably this supplies the API key used by the extraction
# call further down — confirm which variable name is expected.
load_dotenv(override=True)

True

In [29]:
# Load the policy PDF from disk and convert it to Markdown text.
#
# The folder defaults to the original local checkout, but it can be redirected
# with the POLICY_PDF_DIR environment variable (for example from .env) so the
# notebook is not tied to a single machine.
file_path = os.environ.get("POLICY_PDF_DIR", r"C:\Users\aibag\git_repo")
file_name = "ami-motorcycle-policy-wording-ami0065-9-0920.pdf"
pdf_path = os.path.join(file_path, file_name)

# Fail early with a readable message rather than an error from inside the PDF parser.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(
        f"Policy PDF not found: {pdf_path} (set POLICY_PDF_DIR to the folder that contains it)"
    )

md_text = pymupdf4llm.to_markdown(pdf_path)

# Preview the first 200 characters as a quick sanity check on the conversion.
print(md_text[0:200])

### **About** **your policy**

Thank you for choosing to insure your
motorcycle with us.


We’ve designed this document to help you
clearly understand the terms of your policy,
but if you’re unsure ab


In [16]:
# Set up the extraction task: instructions plus one worked example

import textwrap
import langextract as lx

# Step 1: the instructions describing what to extract and how
prompt = textwrap.dedent(
    """\
    Extract insurance coverage types, exclusions, limits, and excesses in order of appearance.
    Use exact text from the document for extractions. Do not paraphrase or merge entities.
    Provide meaningful attributes for each entity to add context, such as coverage scope, monetary value, or conditions.
"""
)


# Step 2: one worked example to guide the model.
# The sample sentence(s) the example extractions are taken from.
_sample_text = (
    "We cover accidental damage to your motorcycle, up to a maximum of $5,000 per claim. "
    "You must pay an excess of $500. "
    "This policy does not cover loss due to racing or speed testing."
)

# (class, extracted text, attributes) — listed in the order the spans occur in
# the sample text; each extracted text is a verbatim substring of it.
_sample_extractions = [
    (
        "coverage",
        "accidental damage to your motorcycle",
        {"limit": "$5,000", "scope": "per claim"},
    ),
    (
        "excess",
        "an excess of $500",
        {"amount": "$500", "applies_to": "accidental damage"},
    ),
    (
        "exclusion",
        "loss due to racing or speed testing",
        {"reason": "high-risk activity"},
    ),
]

examples = [
    lx.data.ExampleData(
        text=_sample_text,
        extractions=[
            lx.data.Extraction(
                extraction_class=label,
                extraction_text=span,
                attributes=attrs,
            )
            for label, span, attrs in _sample_extractions
        ],
    )
]


In [None]:
# Run the extraction over the Markdown text of the policy document.

# Tuning values, kept together so they are easy to adjust.
EXTRACTION_MODEL_ID = "gemini-2.5-flash-lite"
EXTRACTION_PASSES = 2      # Improves recall through multiple passes
MAX_CHAR_BUFFER = 1000     # Smaller contexts for better accuracy

# A run with 20 parallel workers failed with "429 RESOURCE_EXHAUSTED"; the
# recorded error reports a free-tier quota of 10 requests per minute.
# Keep concurrency low on a free-tier key and raise it only with a higher quota.
# NOTE(review): a long document may still exhaust a per-minute quota even at
# low concurrency — if so, reduce EXTRACTION_PASSES or use a key with a larger quota.
MAX_WORKERS = 2

result = lx.extract(
    text_or_documents=md_text,
    prompt_description=prompt,
    examples=examples,
    model_id=EXTRACTION_MODEL_ID,
    extraction_passes=EXTRACTION_PASSES,
    max_workers=MAX_WORKERS,
    max_char_buffer=MAX_CHAR_BUFFER,
)



InferenceRuntimeError: Parallel inference error: Gemini API error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'model': 'gemini-2.5-flash', 'location': 'global'}, 'quotaValue': '10'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '18s'}]}}

In [None]:
# Persist the extraction result and render it as a standalone HTML page

# Step 1: write the annotated document to a JSONL file in the current directory
lx.io.save_annotated_documents(
    [result],
    output_name="extraction_results.jsonl",
    output_dir=".",
)

# Step 2: build the visualization from that JSONL file
html_content = lx.visualize("extraction_results.jsonl")

# In Jupyter/Colab the returned object exposes its markup through `.data`;
# otherwise the returned value is written out as-is.
html_markup = html_content.data if hasattr(html_content, 'data') else html_content

# Step 3: save the markup so it can be opened in a browser
with open("visualization.html", "w", encoding="utf-8") as html_file:
    html_file.write(html_markup)

[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, ? docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|█████████▉| 900/901 [00:00<00:00, 255kB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m
<style>
.lx-highlight { position: relative; border-radius:3px; padding:1px 2px;}
.lx-highlight .lx-tooltip {
  visibility: hidden;
  opacity: 0;
  transition: opacity 0.2s ease-in-out;
  background: #333;
  color: #fff;
  text-align: left;
  border-radius: 4px;
  padding: 6px 8px;
  position: absolute;
  z-index: 1000;
  bottom: 125%;
  left: 50%;
  transform: translateX(-50%);
  font-size: 12px;
  max-width: 240px;
  white-space: normal;
  box-shadow: 0 2px 6px rgba(0,0,0,0.3);
}
.lx-highlight:hover .lx-tooltip { visibility: visible; opacity:1; }
.lx-animated-wrapper { max-width: 100%; font-family: Arial, sans-serif; }
.lx-controls {
  background: #fafafa; border: 1px solid #90caf9; border-radius: 8px;
  padding: 12px; margin-bottom: 16px;
}
.lx-button-row {
  display: flex; justify-content: center; gap: 8px; margin-bottom: 12px;
}
.lx-control-btn {
  background: #4285f4; color: white; border: none; border-ra


