### LangExtract Demo
https://github.com/google/langextract

In [66]:
import os
from dotenv import load_dotenv
import pymupdf4llm
import langextract as lx
import textwrap

# Load settings from a local .env file into the process environment.
# NOTE(review): override=True presumably lets values from .env replace variables
# that are already set in the environment — confirm against the python-dotenv docs.
load_dotenv(override=True)

True

In [67]:
# Load the policy PDF and convert it to Markdown.
#
# The directory that holds the PDF is machine-specific, so it can be supplied through
# the POLICY_PDF_DIR environment variable (for example in the .env file). The original
# location is kept as the fallback so an existing setup keeps working unchanged.
file_path = os.environ.get("POLICY_PDF_DIR", r"C:\Users\aibag\git_repo")
file_name = "ami-motorcycle-policy-wording-ami0065-9-0920.pdf"
pdf_path = os.path.join(file_path, file_name)

# Fail early with the resolved path rather than an opaque error from the PDF library.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"Policy PDF not found: {pdf_path}")

md_text = pymupdf4llm.to_markdown(pdf_path)
print(md_text[0:200])  # short preview to confirm the conversion produced text

### **About** **your policy**

Thank you for choosing to insure your
motorcycle with us.


We’ve designed this document to help you
clearly understand the terms of your policy,
but if you’re unsure ab


In [68]:
# Define the extraction task

# 1. Prompt: the role, the extraction rules and the entity definitions.
#
# The reply layout is intentionally NOT described here. LangExtract renders the
# few-shot examples below in the layout its own parser expects, so a second,
# hand-written JSON schema with different keys ("extraction_class",
# "extraction_text", "attributes") gives the model contradictory instructions.
prompt = textwrap.dedent("""\
    You are an expert general insurance product manager. Your task is to extract specific clauses from an insurance policy document.

    Strictly adhere to the following rules:
    1. Extract Entities: Identify and extract the following six types of entities: 'base_coverage', 'optional_coverage', 'exclusion', 'limit', 'excess', and 'general_product_information'.
    2. Use Exact Text: The `extraction_text` must be an exact quote from the document. Do not paraphrase, summarize, or alter the text.
    3. Provide Contextual Attributes: For each extraction, provide meaningful attributes that add context. For example, a limit's value is meaningless without knowing what it applies to.
    4. Handle Unspecified Amounts: If a limit or excess amount is mentioned but not specified (e.g., "as shown in the Policy Schedule"), capture the reference text in an attribute.

    Entity Definitions:
    - base_coverage: The primary, standard insurance cover provided by the policy.
    - optional_coverage: A non-standard cover requiring purchase or selection, or an elective cover level (e.g., Third Party only).
    - exclusion: Specific situations, items, or actions that the policy does not cover. Include carve-outs in attributes if present.
    - limit: A monetary cap the insurer will pay for a specific claim type, benefit, item category, or event.
    - excess: The amount or rule for what the policyholder must contribute per claim, including special or excess-free provisions.
    - general_product_information: General statements about the policy contract, legal terms, cooling-off, duplicate insurance,
      change in circumstances, basis of settlement, NHI Act interface, governing law, renewal and cancellation.
""")


# 2. Few-shot example: shows the model what a good extraction looks like.
#    The extraction_text is a verbatim substring of the example text, and the
#    attribute carries the context that qualifies the extracted clause.
examples = [
    lx.data.ExampleData(
        text="We cover sudden and accidental loss to your home that happens during the period of insurance.",
        extractions=[
            lx.data.Extraction(
                extraction_class="base_coverage",
                extraction_text="sudden and accidental loss to your home",
                attributes={
                    "scope": "during the period of insurance"
                }
            )
        ]
    )
]

In [None]:
# Run the extraction over the converted policy text using the local Ollama model

result = lx.extract(
    # What to extract, and from which text
    text_or_documents=md_text,
    prompt_description=prompt,
    examples=examples,
    # Which model to call, and the request options passed along with it
    model_id="qwen3:4b",
    model_url="http://localhost:11434",
    language_model_params={
        "timeout": 900,
        "temperature": 0.1,
        "top_p": 0.9,
        "format": "json",
        "stream": False,
    },
    # How the document is processed
    extraction_passes=1,    # a single pass; more passes are meant to improve recall
    max_workers=5,          # chunks processed in parallel for speed
    max_char_buffer=1000,   # small chunks, intended to keep each context accurate
    # Output handling
    fence_output=False,
    use_schema_constraints=False,
)

In [45]:
# Persist the annotated document, then render it as a standalone HTML page

# Step 1: write the extraction result to a JSONL file in the working directory
lx.io.save_annotated_documents([result], output_name="extraction_results.jsonl", output_dir=".")

# Step 2: build the visualization from that file
html_content = lx.visualize("extraction_results.jsonl")

# Step 3: write the markup to disk. In Jupyter/Colab the returned object carries
# the markup in `.data`; when there is no such attribute the value is written as is.
with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(getattr(html_content, "data", html_content))

[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 1452.32 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|█████████▉| 931/932 [00:00<?, ?B/s] 

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m





In [60]:
import requests

def test_ollama_direct(
    url="http://localhost:11434/api/generate",
    model="qwen3:4b",
    test_prompt="Why do birds tweet? Respond using JSON.",
    timeout=900,
):
    """Send one prompt straight to the Ollama HTTP API and print the raw reply.

    This bypasses LangExtract, so it shows whether the local model itself can
    return JSON.

    Args:
        url: Ollama generate endpoint to post to.
        model: Name of the model to query.
        test_prompt: Prompt text. It asks for JSON explicitly because the request
            also sets ``format: "json"``; without that instruction the model may
            emit mostly whitespace or incomplete content.
        timeout: Seconds to wait for the HTTP response before giving up. It is
            generous because the first call may have to load the model.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    data = {
        "model": model,
        "prompt": test_prompt,
        "stream": False,
        "format": "json",  # Force JSON output
        "options": {
            "temperature": 0.1,
            "num_predict": 200
        }
    }

    # A timeout keeps the cell from hanging forever if the server is not running.
    response = requests.post(url, json=data, timeout=timeout)
    # Surface HTTP errors (e.g. unknown model) instead of a KeyError on 'response' below.
    response.raise_for_status()
    result = response.json()
    print("Raw model output:")
    print(result['response'])


test_ollama_direct()

Raw model output:
{


"question": "Why do birds tweet?",
}
