# Using LangExtract and Elasticsearch

This notebook demonstrates how to use LangExtract to extract structured data from text documents and index it into Elasticsearch for searching and analysis. This notebook is based on the article [Using LangExtract and Elasticsearch](https://www.elastic.co/search-labs/blog/using-langextract-and-elasticsearch).

In [None]:
%pip install langextract elasticsearch -q

## Installing dependencies and importing packages

In [None]:
import langextract as lx
import json
import os
import glob


from google.colab import files  # only for Google Colab notebooks
from elasticsearch import Elasticsearch, helpers
from IPython.display import HTML
from getpass import getpass

In [None]:
# Ask for each credential without echoing it and store it in the process
# environment under the matching variable name.
credential_prompts = {
    "ELASTICSEARCH_API_KEY": "Enter your Elasticsearch API key: ",
    "ELASTICSEARCH_URL": "Enter your Elasticsearch URL: ",
    "LANGEXTRACT_API_KEY": "Enter your LangExtract API key: ",  # Gemini API key
}

for env_name, prompt_text in credential_prompts.items():
    os.environ[env_name] = getpass(prompt_text)


# Name of the Elasticsearch index used by the rest of the notebook.
INDEX_NAME = "contracts"

## Elasticsearch Python client

In [None]:
# Build the client from the URL and API key stored in the environment.
es_url = os.environ["ELASTICSEARCH_URL"]
es_api_key = os.environ["ELASTICSEARCH_API_KEY"]

es_client = Elasticsearch(es_url, api_key=es_api_key)

## Index setup

In [None]:
# Field mapping for the contracts index. Apart from `raw_contract`, which
# stores the full contract text, the field names mirror the extraction classes
# used in the LangExtract examples. Both date fields expect MM/dd/yyyy values.
mapping = {
    "mappings": {
        "properties": {
            "contract_date": {"type": "date", "format": "MM/dd/yyyy"},
            "end_contract_date": {"type": "date", "format": "MM/dd/yyyy"},
            "service_provider": {
                "type": "text",
                "fields": {"keyword": {"type": "keyword"}},
            },
            "client": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
            "service_type": {"type": "keyword"},
            "payment_amount": {"type": "float"},
            "delivery_time_days": {"type": "integer"},
            "governing_law": {"type": "keyword"},
            "raw_contract": {"type": "text"},
        }
    }
}

try:
    # Check first so that re-running this cell against an existing index is
    # reported as a no-op instead of as a creation error.
    if es_client.indices.exists(index=INDEX_NAME):
        print(f"Index {INDEX_NAME} already exists, skipping creation")
    else:
        es_client.indices.create(index=INDEX_NAME, body=mapping)
        print(f"Index {INDEX_NAME} created successfully")
except Exception as e:
    # Best-effort: report the problem (e.g. connection or auth failure) and
    # let the reader decide whether to continue.
    print(f"Error creating index: {e}")

## LangExtract

### Providing Context examples


In [None]:
# Natural-language instructions describing which contract fields to extract.
contract_prompt_description = "Extract contract information including dates, parties (contractor and contractee), purpose/services, payment amounts, timelines, and governing law in the order they appear in the text."

# A single worked example: the sample text plus the (class, value) pairs that
# should be extracted from it, listed in the order they are passed to the model.
example_text = "Service Agreement dated March 10, 2024, between ABC Corp (Service Provider) and John Doe (Client) for consulting services. Payment: $5,000. Delivery: 30 days. Contract ends June 10, 2024. Governed by California law."

example_fields = [
    ("contract_date", "03/10/2024"),
    ("end_contract_date", "06/10/2024"),
    ("service_provider", "ABC Corp"),
    ("client", "John Doe"),
    ("service_type", "consulting services"),
    ("payment_amount", "5000"),
    ("delivery_time_days", "30"),
    ("governing_law", "California"),
]

contract_examples = [
    lx.data.ExampleData(
        text=example_text,
        extractions=[
            lx.data.Extraction(
                extraction_class=field_class, extraction_text=field_value
            )
            for field_class, field_value in example_fields
        ],
    )
]

### Uploading contract files

In [None]:
# Upload the contract files using the google.colab `files` helper (Colab only).
# NOTE(review): the extraction cell globs "*.txt" in the working directory, so
# the uploads presumably land there — confirm when running outside Colab.
files.upload()

### Extracting content using LangExtract

In [None]:
# Sort the matches so contracts are processed (and later indexed) in a
# reproducible order; glob itself returns them in arbitrary order.
contract_files = sorted(glob.glob("*.txt"))

print(f"Found {len(contract_files)} contract files:")

for i, file_path in enumerate(contract_files, 1):
    print(f"\t{i}. {os.path.basename(file_path)}")

# One LangExtract result per contract file, in the same order as contract_files.
results = []

for file_path in contract_files:
    # Read the whole contract up front so the file handle is closed before
    # the model call is made.
    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()

    # Run the extraction
    contract_result = lx.extract(
        text_or_documents=content,
        prompt_description=contract_prompt_description,
        examples=contract_examples,
        model_id="gemini-2.5-flash",
    )

    results.append(contract_result)

In [None]:
# File that receives the extraction output (read back line by line later on).
NDJSON_FILE = "extraction_results.jsonl"

# Persist all extraction results to NDJSON_FILE in the current directory.
lx.io.save_annotated_documents(
    results,
    output_name=NDJSON_FILE,
    output_dir=".",
)

# Build the visualization from the file that was just written.
html_content = lx.visualize(NDJSON_FILE)

### Rendering HTML visualization

In [None]:
# Grab the markup once, then both persist it and display it.
visualization_markup = html_content.data

# Keep a standalone copy of the visualization on disk.
with open("visualization.html", "w", encoding="utf-8") as html_file:
    html_file.write(visualization_markup)

# Render the same markup as this cell's output.
HTML(visualization_markup)

## Ingesting data into Elasticsearch

In [None]:
def build_data(ndjson_file, index_name):
    """Yield one Elasticsearch bulk action per document in an NDJSON file.

    Each non-blank line is parsed as a JSON object that carries the document
    ``text`` and a list of ``extractions``. The extractions are flattened into
    ``{extraction_class: extraction_text}`` (when a class appears more than
    once, the last value wins) and the original text is stored under
    ``raw_contract``.

    Args:
        ndjson_file: Path of the NDJSON/JSONL file to read.
        index_name: Name of the index every action targets.

    Yields:
        dict: ``{"_index": index_name, "_source": contract_doc}``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a document lacks ``text`` or ``extractions``, or an
            extraction lacks ``extraction_class`` / ``extraction_text``.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform default,
    # matching how the other files in this notebook are opened.
    with open(ndjson_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. trailing newlines) rather than
            # failing inside json.loads.
            if not line.strip():
                continue

            doc = json.loads(line)

            contract_doc = {
                extraction["extraction_class"]: extraction["extraction_text"]
                for extraction in doc["extractions"]
            }
            contract_doc["raw_contract"] = doc["text"]

            yield {"_index": index_name, "_source": contract_doc}


try:
    # With the default raise_on_error=True, helpers.bulk raises as soon as a
    # chunk contains a failed document, so the returned error list would never
    # be populated. Collect the per-document failures instead so they can be
    # reported below.
    success, errors = helpers.bulk(
        es_client,
        build_data(NDJSON_FILE, INDEX_NAME),
        raise_on_error=False,
    )
    print(f"{success} documents indexed successfully")

    if errors:
        print("Errors during indexing:", errors)
except Exception as e:
    # Still reached for failures that are not per-document, such as
    # connection problems or an unreadable input file.
    print(f"Error: {str(e)}")

## Querying data

In [None]:
# Filter-only query: contracts with a payment of at least 15000 whose end date
# is not later than the current time.
search_body = {
    "query": {
        "bool": {
            "filter": [
                {"range": {"payment_amount": {"gte": 15000}}},
                {"range": {"end_contract_date": {"lte": "now"}}},
            ]
        }
    }
}

try:
    # The raw contract text is excluded from the returned documents.
    response = es_client.search(
        index=INDEX_NAME,
        source_excludes=["raw_contract"],
        body=search_body,
    )

    hits = response["hits"]
    print(f"\nTotal hits: {hits['total']['value']}")

    # Pretty-print the stored fields of every matching contract.
    for hit in hits["hits"]:
        print(json.dumps(hit["_source"], indent=4))

except Exception as e:
    print(f"Error searching index: {str(e)}")

## Deleting

Delete the index created in this notebook so that it no longer consumes cluster resources.

In [None]:
# Cleanup - Delete Index
# Per-request transport options such as ignored status codes are set through
# `.options()`; passing `ignore=` directly to the API method is the deprecated
# form in the 8.x client. 400 and 404 are ignored so this cell also succeeds
# when the index has already been removed.
es_client.options(ignore_status=[400, 404]).indices.delete(index=INDEX_NAME)