### Langextract Demo
https://github.com/google/langextract

In [24]:
import os
from dotenv import load_dotenv
import pymupdf4llm
import langextract as lx
import textwrap


load_dotenv(override=True)

True

In [25]:
# load pdf from file
file_path = r"C:\Users\aibag\git_repo"
file_name = "ami-motorcycle-policy-wording-ami0065-9-0920.pdf"
md_text = pymupdf4llm.to_markdown(os.path.join(file_path, file_name), pages=[3])
print(md_text[0:200])

Policy wording

##### **What is covered by this policy**


Cover for your motorcycle


**Your motorcycle is covered for any accidental physical loss or damage that occurs during the period of cover an


In [27]:
# Define your extraction task

# # 1. Define the prompt and extraction rules
# prompt = textwrap.dedent("""\
#     Extract insurance coverage types (base coverage and optional covereage), cover exclusions, cover limits, excesses and general product information in order of appearance.
#     Use exact text from the document for extractions. Do not paraphrase or merge entities.
#     Provide meaningful attributes for each entity to add context, such as coverage scope, monetary value, or conditions.
# """)

# # 2. Provide a high-quality example to guide the model
# examples = [
#     lx.data.ExampleData(
#         text="ROMEO. But soft! What light through yonder window breaks? It is the east, and Juliet is the sun.",
#         extractions=[
#             lx.data.Extraction(
#                 extraction_class="character",
#                 extraction_text="ROMEO",
#                 attributes={"emotional_state": "wonder"}
#             ),
#             lx.data.Extraction(
#                 extraction_class="emotion",
#                 extraction_text="But soft!",
#                 attributes={"feeling": "gentle awe"}
#             ),
#             lx.data.Extraction(
#                 extraction_class="relationship",
#                 extraction_text="Juliet is the sun",
#                 attributes={"type": "metaphor"}
#             ),
#         ]
#     )
# ]


prompt = textwrap.dedent("""\
    You are an expert general insurance product manager. Your task is to extract specific clauses from an insurance policy document.

    Strictly adhere to the following rules:
    1. Extract Entities: Identify and extract the following six types of entities: 'coverage', 'exclusion', 'limit', 'excess', and 'general_policy_information'.
    2. Use Exact Text: The 'extraction_text' must be an exact quote from the document. Do not paraphrase, summarize, or alter the text.
    3. Provide Contextual Attributes: For each extraction, provide meaningful attributes that add context. For example, a limit's value is meaningless without knowing what it applies to.
    4. Handle Unspecified Amounts: If a limit or excess amount is mentioned but not specified (e.g., "as shown in the Policy Schedule"), capture the reference text in an attribute.

    Entity Definitions:
    - base_coverage: The primary, standard insurance cover provided by the policy.
    - optional_coverage: A non-standard cover requiring purchase or selection, or an elective cover level (e.g., Third Party only).
    - exclusion: Specific situations, items, or actions that the policy does not cover.
    - limit: A monetary cap the insurer will pay for a specific claim type, benefit, item category, or event.
    - excess: The amount or rule for what the policyholder must contribute per claim, including special or excess-free provisions.
    - general_policy_information: General statements about the policy contract, legal terms, cooling-off, claims, change in circumstances, basis of settlement, renewal and cancellation, etc.
                         
    Return your answer as a JSON object with this format:
    {
        "extractions": [
            {
                "extraction_class": "entity type",
                "extraction_text": "exact text from the policy document",
                "attributes": {...}
            }
        ]
    }
                         
                         


""")



# 2. Provide a high-quality example to guide the model
examples = [
    lx.data.ExampleData(
        text="We cover sudden and accidental loss to your home that happens during the period of insurance.",
        extractions=[
            lx.data.Extraction(
                extraction_class="coverage",
                extraction_text="cover sudden and accidental loss to your home",
                attributes={
                    "type": "cover",
                    "event": "sudden and accidental loss to your home",
                    "time_scope": "period of insurance",
                    "applies_to":"home"

                }
            )
        ]
    ),

    lx.data.ExampleData(
        text="We do not cover the cost to repair the cause of the hidden gradual damage.",
        extractions=[
            lx.data.Extraction(
                extraction_class="coverage",
                extraction_text="do not cover the cost to repair the cause of the hidden gradual damage.",
                attributes={
                    "type":"exclusion",
                    "event_concept": "hidden gradual damage",
                    "scope":"",
                    "applies_to":"policy"                    
                }
            )
        ]
    ),

    lx.data.ExampleData(
        text="The most we pay during the period of insurance is $3,000.",
        extractions=[
            lx.data.Extraction(
                extraction_class="limit",
                extraction_text="The most we pay during the period of insurance is $3,000.",
                attributes={
                    "type":"limit",
                    "event_concept":"payment",
                    "amount_type":"pay 3000",
                    "applies_to":"policy"                         

                }
            )
        ]
    ),

    lx.data.ExampleData(
        text="Your policy is a contract between you and us, and has three parts...",
        extractions=[
            lx.data.Extraction(
                extraction_class="general_policy_information",
                extraction_text="Your policy is a contract between you and us, and has three parts",
                attributes={
                    "type":"information",
                    "event_concept":"contract",
                    "applies_to":"policy"
                }
            )
        ]
    ),

    lx.data.ExampleData(
        text="Your motorcycle is covered for any accidental physical loss or damage that occurs during the period of cover anywhere in New Zealand (including in transit between places in New Zealand).",
        extractions=[
            lx.data.Extraction(
                extraction_class="general_policy_information",
                extraction_text="Your policy is a contract between you and us, and has three parts",
                attributes={
                    "type":"cover",
                    "event_concept":"accidental physical loss or damage",
                    "applies_to":"motorcycle",
                    "scope":"anywhere in New Zealand (including in transit between places in New Zealand)",
                }
            )
        ]
    )    

]

In [None]:
# Run the extraction

# The input text to be processed
# input_text = "Lady Juliet gazed longingly at the stars, her heart aching for Romeo"

# Run the extraction
# result = lx.extract(
#     text_or_documents=input_text,
#     prompt_description=prompt,
#     examples=examples,
#     #model_id="gemini-2.5-flash",
#     model_url="http://localhost:11434",
#     model_id="llama3.1",
#     debug=True,
# )

result = lx.extract(
    text_or_documents=md_text,
    prompt_description=prompt,
    examples=examples,
    model_id="llama3.1",
    extraction_passes=1,    # Improves recall through multiple passes
    max_workers=1,          # Parallel processing for speed
    max_char_buffer=1000,   # Smaller contexts for better accuracy
    model_url="http://localhost:11434",
    language_model_params={
        "timeout": 900,
        "temperature": 0.0,    
        "top_p": 0.9,
        "format": "json",
        "stream": False, 
    },
    fence_output=False,          
    use_schema_constraints=False,
    debug=True,
)

DEBUG:absl:WordAligner: Starting alignment of extractions with the source text. Extraction groups to align: [[Extraction(extraction_class='coverage', extraction_text='cover sudden and accidental loss to your home', char_interval=None, alignment_status=None, extraction_index=None, group_index=None, description=None, attributes={'type': 'cover', 'event': 'sudden and accidental loss to your home', 'time_scope': 'period of insurance', 'applies_to': 'home'})]]
2025-09-10 20:45:13,869 - langextract.debug - DEBUG - [langextract.core.tokenizer] CALL: tokenize(text='We cover sudden and accidental loss to your home that happens during the period of insurance.')
2025-09-10 20:45:13,870 - langextract.debug - DEBUG - [langextract.core.tokenizer] RETURN: tokenize -> TokenizedText...wline=False)]) (0.1 ms)
2025-09-10 20:45:13,872 - langextract.debug - DEBUG - [langextract.core.tokenizer] CALL: tokenize(text='␟')
2025-09-10 20:45:13,872 - langextract.debug - DEBUG - [langextract.core.tokenizer] RETURN

[92m✓[0m Extraction processing complete



INFO:absl:Finalizing annotation for document ID doc_b525f2a0.
INFO:absl:Document annotation completed.


[92m✓[0m Extracted [1m38[0m entities ([1m5[0m unique types)
  [96m•[0m Time: [1m171.96s[0m
  [96m•[0m Speed: [1m19[0m chars/sec
  [96m•[0m Chunks: [1m4[0m


In [29]:
# Visualise the results 

# Save the results to a JSONL file
lx.io.save_annotated_documents([result], output_name="extraction_results.jsonl", output_dir=".")

# Generate the visualization from the file
html_content = lx.visualize("extraction_results.jsonl")

with open("visualization.html", "w", encoding="utf-8") as f:
    if hasattr(html_content, 'data'):
        f.write(html_content.data)  # For Jupyter/Colab
    else:
        f.write(html_content)


[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 498.49 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m




[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|█████████▉| 18.8k/18.8k [00:00<00:00, 18.8MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m





In [60]:
import requests

def test_ollama_direct():
    url = "http://localhost:11434/api/generate"
    
    test_prompt = """Why do birds tweet?"""
    
    data = {
        "model": "qwen3:4b",
        "prompt": test_prompt,
        "stream": False,
        "format": "json",  # Force JSON output
        "options": {
            "temperature": 0.1,
            "num_predict": 200
        }
    }
    
    response = requests.post(url, json=data)
    result = response.json()
    print("Raw model output:")
    print(result['response'])


test_ollama_direct()

Raw model output:
{


"question": "Why do birds tweet?",
}
