In [6]:
#Step 1: search, select
import boto3
from IPython.display import display, Markdown
import ipywidgets as widgets

# Initialize the Kendra client once at module level; query_kendra() reuses it.
# No region or credentials are passed, so boto3 resolves them on its own.
kendra_client = boto3.client("kendra")

# Function to query the Kendra index with filters
def query_kendra(index_id, query_text, jurisdiction=None, doc_type=None):
    """
    Queries AWS Kendra with optional filters for jurisdiction and document type.

    Args:
        index_id: Identifier of the Kendra index to search.
        query_text: Free-text search string.
        jurisdiction: Optional value the "jurisdiction" attribute must equal.
        doc_type: Optional value the "doc_type" attribute must equal.

    Returns:
        A list of dicts with the keys "DocumentTitle", "DocumentId" and
        "ExcerptText", one per result item that carries a "DocumentId".
    """
    # Build one EqualsTo clause per filter the caller actually supplied.
    requested = (("jurisdiction", jurisdiction), ("doc_type", doc_type))
    attribute_filters = [
        {"EqualsTo": {"Key": key, "Value": {"StringValue": value}}}
        for key, value in requested
        if value
    ]

    # boto3 validates parameter types and rejects AttributeFilter=None, so the
    # argument is only included when at least one filter is present.
    query_kwargs = {"IndexId": index_id, "QueryText": query_text}
    if attribute_filters:
        query_kwargs["AttributeFilter"] = {"AndAllFilters": attribute_filters}

    response = kendra_client.query(**query_kwargs)

    results = []
    for item in response.get("ResultItems", []):
        if "DocumentId" not in item:
            continue

        title = (item.get("DocumentTitle") or {}).get("Text", "No Title")
        # Kendra returns the excerpt under DocumentExcerpt.Text; the flat
        # "ExcerptText" key is kept as a fallback for older callers/fixtures.
        excerpt = (
            (item.get("DocumentExcerpt") or {}).get("Text")
            or item.get("ExcerptText", "No Excerpt Available")
        )
        results.append({
            "DocumentTitle": title,
            "DocumentId": item["DocumentId"],
            "ExcerptText": excerpt,
        })
    return results

# Step 1 widgets: a free-text query box plus two optional metadata filters.
def _filter_dropdown(label, choices):
    """Build a half-width dropdown whose leading blank option means "no filter"."""
    return widgets.Dropdown(
        description=label,
        options=["", *choices],
        layout=widgets.Layout(width="50%"),
    )


query_input = widgets.Text(
    description="Search:",
    placeholder="Search your documents",
    layout=widgets.Layout(width="70%"),
)
jurisdiction_dropdown = _filter_dropdown(
    "Jurisdiction:", ["Colorado", "California", "Texas"]
)
doc_type_dropdown = _filter_dropdown(
    "Doc Type:", ["Guidance Memo", "Regulation", "Permit"]
)

# Shared output area and the confirm button shown under the search results.
output = widgets.Output()
select_button = widgets.Button(description="Select Documents")

# Module-level selection state, filled in by select_documents().
selected_document_ids, selected_document_titles = [], []

def on_search_click(change):
    """
    Click handler for the search button: run the Kendra query and list results.

    Renders each hit (title and excerpt) into the shared ``output`` widget,
    followed by one checkbox per hit and the ``select_button``.

    Args:
        change: Object ipywidgets passes to click callbacks; not used here.
    """
    with output:
        output.clear_output()

        # Guard against an empty search box instead of sending it to Kendra.
        query_text = query_input.value.strip()
        if not query_text:
            display(Markdown("**Please enter a search query.**"))
            return

        results = query_kendra(
            index_id="ac2e614a-1a60-4788-921f-439355c5756d",
            query_text=query_text,
            jurisdiction=jurisdiction_dropdown.value.strip() or None,
            doc_type=doc_type_dropdown.value.strip() or None
        )
        if not results:
            display(Markdown("**No results found.**"))
            return

        display(Markdown("**Search Results:**"))
        document_checkboxes = []
        for result in results:
            # Display result title and excerpt
            display(Markdown(f"**{result['DocumentTitle']}**\n{result['ExcerptText']}"))
            checkbox = widgets.Checkbox(description=result['DocumentTitle'], value=False)
            # Stash the document ID on the widget so select_documents() can read it back.
            checkbox.document_id = result["DocumentId"]
            document_checkboxes.append(checkbox)

        # select_button is shared across searches. Unregister the handler from
        # the previous search first; otherwise every search adds another
        # callback and stale ones keep firing against old checkboxes.
        previous_handler = getattr(on_search_click, "_select_handler", None)
        if previous_handler is not None:
            select_button.on_click(previous_handler, remove=True)

        def handle_select(_button, checkboxes=document_checkboxes):
            select_documents(checkboxes)

        select_button.on_click(handle_select)
        on_search_click._select_handler = handle_select

        display(widgets.VBox(document_checkboxes))
        display(select_button)

def select_documents(document_checkboxes):
    """
    Record which result checkboxes are ticked and report the selection.

    Empties the module-level ``selected_document_ids`` and
    ``selected_document_titles`` lists in place, refills them from the ticked
    checkboxes, and writes a summary into the shared ``output`` widget.

    Args:
        document_checkboxes: Checkbox widgets carrying a ``document_id``
            attribute and the document title as ``description``.
    """
    with output:
        output.clear_output()

        # Empty in place: the same list objects are read later by the ingestion step.
        del selected_document_ids[:]
        del selected_document_titles[:]

        for box in document_checkboxes:
            if not box.value:
                continue
            selected_document_ids.append(box.document_id)
            selected_document_titles.append(box.description)

        picked = len(selected_document_ids)
        if picked == 0:
            display(Markdown("**No documents selected.**"))
            return

        # Summarise what was picked.
        display(Markdown(f"**{picked} doc(s) selected. Proceed to step 2.**"))
        display(Markdown(f"**Doc Titles:** {', '.join(selected_document_titles)}"))

# Display UI: create the search button and route its clicks to on_search_click().
search_button = widgets.Button(description="Search Documents")
search_button.on_click(on_search_click)

# Render the inputs, the button, and the shared output area in that order.
display(query_input, jurisdiction_dropdown, doc_type_dropdown, search_button, output)

Text(value='', description='Search:', layout=Layout(width='70%'), placeholder='Search your documents')

Dropdown(description='Jurisdiction:', layout=Layout(width='50%'), options=('', 'Colorado', 'California', 'Texa…

Dropdown(description='Doc Type:', layout=Layout(width='50%'), options=('', 'Guidance Memo', 'Regulation', 'Per…

Button(description='Search Documents', style=ButtonStyle())

Output()

In [7]:
# Step 2: Fetch, chunk, and map each chunk to its source document (new version; delete the cell above if this works)
import boto3
from IPython.display import display, Markdown
import ipywidgets as widgets
import re
import io
from PyPDF2 import PdfReader

# Initialize the S3 client once at module level; fetch_document_text() reuses it.
s3_client = boto3.client("s3")

# Function to fetch document text from S3
def fetch_document_text(document_uri):
    """
    Fetches document content from S3, dynamically handling multiple buckets
    and supporting both PDFs and text files.

    Args:
        document_uri: Location of the object in the form ``s3://bucket/key``.

    Returns:
        The document text. For keys ending in ".pdf" (any letter case) this is
        the extracted text of every page that yields text, joined by newlines;
        otherwise the object bytes decoded as UTF-8.

    Raises:
        ValueError: If the URI is not an S3 URI, the key does not exist, or
            any other error occurs while downloading or decoding. The original
            exception is attached as ``__cause__``.
    """
    # Extract bucket name and object key from S3 URI
    match = re.match(r"s3://([^/]+)/(.*)", document_uri)
    if not match:
        raise ValueError(f"Invalid S3 URI format: {document_uri}")

    bucket_name, object_key = match.groups()
    print(f"Fetching from S3: Bucket={bucket_name}, Key={object_key}")  # Debugging info

    try:
        response = s3_client.get_object(Bucket=bucket_name, Key=object_key)
        content = response["Body"].read()

        # Handle PDFs; compare case-insensitively so "REPORT.PDF" is not
        # mistakenly decoded as UTF-8 text.
        if object_key.lower().endswith(".pdf"):
            pdf_reader = PdfReader(io.BytesIO(content))
            # Extract each page once (extraction is the expensive step) and
            # drop pages that yield no text.
            page_texts = (page.extract_text() for page in pdf_reader.pages)
            return "\n".join(text for text in page_texts if text)

        return content.decode("utf-8")  # Handle text files
    except s3_client.exceptions.NoSuchKey as e:
        raise ValueError(
            f"Error: The file '{object_key}' does not exist in bucket '{bucket_name}'."
        ) from e
    except Exception as e:
        # Callers only catch ValueError, so every other failure is normalised
        # to it while keeping the original traceback via exception chaining.
        raise ValueError(f"Unexpected error fetching document: {e}") from e

# Function to chunk document into smaller pieces
def chunk_document(text, chunk_size=500):
    """
    Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The split is purely positional; every piece except possibly the last has
    exactly ``chunk_size`` characters. Empty text yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

# UI for ingestion: a slider for the chunk size (in characters, as consumed by
# chunk_document), a button that starts ingestion, and an output area for
# progress messages.
chunk_size_slider = widgets.IntSlider(
    value=500, min=100, max=2000, step=100, description="Chunk Size:"
)
ingest_button = widgets.Button(description="Ingest Documents")
ingestion_output = widgets.Output()

# Step 2 state, rebuilt by ingest_documents() on every run.
all_chunks = []
chunk_doc_map = []  # Parallel to all_chunks: entry i is the source document ID of chunk i

def ingest_documents(change):
    """
    Click handler for the ingest button: download and chunk selected documents.

    Rebinds the module-level ``all_chunks`` and ``chunk_doc_map`` to fresh
    lists, then fills them with the chunks of every document in
    ``selected_document_ids`` and the matching document ID per chunk.
    Progress and errors are rendered into ``ingestion_output``.

    Args:
        change: Object ipywidgets passes to click callbacks; not used here.
    """
    global all_chunks, chunk_doc_map

    with ingestion_output:
        ingestion_output.clear_output()

        # Nothing to do until Step 1 has produced a selection.
        if not selected_document_ids:
            display(Markdown("**No documents selected. Please complete Step 1.**"))
            return

        # Start from a clean slate on every run.
        all_chunks, chunk_doc_map = [], []

        for doc_id in selected_document_ids:
            try:
                pieces = chunk_document(
                    fetch_document_text(doc_id),
                    chunk_size=chunk_size_slider.value,
                )
                # Keep both lists index-aligned: one source entry per chunk.
                all_chunks += pieces
                chunk_doc_map += [doc_id for _ in pieces]
                display(Markdown(f"Successfully processed document: **{doc_id}**"))
            except ValueError as err:
                # A failed document is reported and skipped; the rest continue.
                display(Markdown(f"**Error:** {err}"))

        if not all_chunks:
            display(Markdown("**No valid text extracted.**"))
            return

        display(Markdown(f"**{len(all_chunks)} document chunks stored. Proceed to Step 3.**"))

        # Debugging aid: list each distinct source that contributed chunks.
        print("Debugging: Unique document sources used:")
        for source in set(chunk_doc_map):
            print(source)

# Route button clicks to ingest_documents() and render the Step 2 controls
# stacked vertically: slider, button, then the progress output.
ingest_button.on_click(ingest_documents)
display(widgets.VBox([chunk_size_slider, ingest_button, ingestion_output]))


VBox(children=(IntSlider(value=500, description='Chunk Size:', max=2000, min=100, step=100), Button(descriptio…

In [19]:
# Debugging: show how many chunks are currently held in all_chunks
# (0 means the ingestion step has not stored anything yet).
print("Debugging: Number of chunks available:", len(all_chunks))


Debugging: Number of chunks available: 5031


In [5]:
import boto3
import json
from IPython.display import display, Markdown

# Initialize the Bedrock Runtime client once; invoke_claude_model() reuses it.
bedrock_runtime_client = boto3.client("bedrock-runtime")

# Placeholder for document context from Step 2.
# NOTE(review): not referenced by any function visible in this notebook;
# confirm whether it is still needed.
all_prompts = ["Context: You are a helpful assistant that answers questions based on the provided context."]

# Function to invoke the Claude model
def invoke_claude_model(context, question, max_output_tokens=2000,
                        model_id="anthropic.claude-3-5-sonnet-20240620-v1:0"):
    """
    Send a context + question prompt to a Claude model through Bedrock.

    Args:
        context: Text placed before the question as grounding material.
        question: The user's question.
        max_output_tokens: Value sent as ``max_tokens`` in the request body.
        model_id: Bedrock model identifier to invoke. Defaults to the model
            this notebook was written against.

    Returns:
        The text parts of the model response joined by single spaces, or
        ``None`` if the call or the response parsing failed (the error is
        printed, not raised, so the interactive chat loop keeps running).
    """
    try:
        # Single-turn conversation: context and question share one user message.
        request_body = {
            "messages": [
                {"role": "user", "content": f"Context: {context}\nQuestion: {question}"}
            ],
            "max_tokens": max_output_tokens,
            "anthropic_version": "bedrock-2023-05-31",
        }

        response = bedrock_runtime_client.invoke_model(
            modelId=model_id,
            body=json.dumps(request_body),
        )
        result = json.loads(response["body"].read().decode("utf-8"))

        content = result.get("content") if isinstance(result, dict) else None
        if not isinstance(content, list):
            raise ValueError("Claude API response does not contain valid 'content'.")

        # Keep only text parts; .get() tolerates items that lack a "type" key.
        return " ".join(
            item["text"]
            for item in content
            if isinstance(item, dict) and item.get("type") == "text"
        )
    except Exception as e:
        # Deliberate best-effort boundary: report and signal failure via None.
        print(f"Error: {e}")
        return None

# Function to find the most relevant chunks for a question
def find_relevant_chunks(question, chunks, chunk_doc_map, num_chunks=15):
    """
    Finds the most relevant chunks based on keyword overlap with the question
    and labels them with source documents.

    Words are compared case-insensitively with punctuation ignored, so a
    question ending in "Colorado?" still matches "colorado" in a chunk.

    Args:
        question: The user's question.
        chunks: List of text chunks to search.
        chunk_doc_map: List parallel to ``chunks``; entry ``i`` names the
            source document of ``chunks[i]``.
        num_chunks: Maximum number of chunks to return.

    Returns:
        A ``(relevant_chunks, sources)`` tuple. ``relevant_chunks`` holds up to
        ``num_chunks`` strings formatted as ``"[Source: <doc>] <chunk>"``,
        ordered by descending overlap (ties keep their original order).
        ``sources`` lists the distinct documents of exactly those chunks, in
        order of first appearance. If no chunk shares a word with the
        question, evenly spaced chunks are returned instead.
    """
    import re  # Local import so the function does not rely on another cell's imports.

    if not chunks or num_chunks <= 0:
        return [], []

    tokenize = re.compile(r"\w+").findall
    keywords = set(tokenize(question.lower()))

    # Score every chunk by the number of distinct question words it contains.
    scored = []
    for index, chunk in enumerate(chunks):
        overlap = len(keywords.intersection(tokenize(chunk.lower())))
        if overlap > 0:
            scored.append((overlap, index))

    if scored:
        # A stable sort keeps document order among equally scored chunks.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        chosen = [index for _, index in scored[:num_chunks]]
    else:
        # Fallback: sample evenly across the whole corpus.
        total = len(chunks)
        count = min(num_chunks, total)
        chosen = [(slot * total) // count for slot in range(count)]

    relevant_chunks = [
        f"[Source: {chunk_doc_map[index]}] {chunks[index]}" for index in chosen
    ]
    # Report only the documents behind the returned chunks, de-duplicated in
    # order of first appearance.
    selected_sources = list(dict.fromkeys(chunk_doc_map[index] for index in chosen))

    return relevant_chunks, selected_sources

# Chat with Documents
def chat_with_documents():
    """
    Run an interactive question/answer loop over the ingested document chunks.

    Reads questions from standard input until the user types "exit". For each
    question it selects up to 35 relevant chunks, sends them with the question
    to the Claude model, and renders the question, answer and sources.
    Returns immediately with an error message if no chunks are loaded.
    """
    if not all_chunks:
        print("Error: No document chunks available. Please run Step 2 first.")
        return

    prompt = "Enter your question (or type 'exit' to end): "
    question = input(prompt).strip()

    while question.lower() != "exit":
        # Pick the context for this question and flatten it into one string.
        context_chunks, selected_sources = find_relevant_chunks(
            question, all_chunks, chunk_doc_map, num_chunks=35
        )
        context = " ".join(context_chunks)
        sources_text = ", ".join(selected_sources)

        print(f"Debugging: {len(context_chunks)} relevant chunks selected.")
        print(f"Debugging: Sources used: {sources_text}")

        response = invoke_claude_model(context, question, max_output_tokens=2000)

        if not response:
            print("No AI response received.")
        else:
            display(Markdown(f"**User Question:** {question}"))
            display(Markdown(f"**AI Response:** {response}"))
            display(Markdown(f"**Sources Used:** {sources_text}"))

        question = input(prompt).strip()

    print("Exiting chat...")

# Start the interactive chat; this blocks the cell on input() until the user
# types "exit".
chat_with_documents()

Enter your question (or type 'exit' to end):  How do I report my emissions from well unloading in Colorado?


Debugging: 35 relevant chunks selected.
Debugging: Sources used: s3://sbx-kendra-index/Colorado/Colorado_-_Regulation_Number_3_-_Stationary_Source_Permitting_and_Air_Pollutant_Emission_Notice_Requirements.pdf, s3://sbx-kendra-index/Colorado/Colorado_-_Regulation_Number_7_-_Control_of_Emissions_from_Oil_and_Gas_Emissions_Operations.pdf, s3://sbx-colorado-only/Colorado/Colorado_-_General_Air_Permits__-_General_Permit_GP11.pdf, s3://sbx-colorado-only/Colorado/Colorado_-_Air_permitting_guidance_memos__-_Memo_20-04_-_Routine_or_Predictable_Gas_Venting_Emissions_Calculation_and_Instructions_on_Permitting_for_Oil_and_Natural_Gas_Operations.pdf


**User Question:** How do I report my emissions from well unloading in Colorado?

**AI Response:** Based on the regulations in Colorado Regulation Number 7, here are the key requirements for reporting emissions from well unloading:

1. Owners/operators must submit a single annual report by June 30th each year that includes information on well unloading events from the previous calendar year.

2. The report must include:

- The API number of the well and AIRS number of any associated storage tanks

- The date, time, and duration of each well unloading event

- Whether the event was controlled or not (starting with 2023 reporting year)

- The best management practices used to minimize emissions

- Any safety needs that prevented using best practices

- An estimate of the volume of natural gas, VOC, NOx, N2O, CO2, CO, ethane, and methane emitted during the event

- The emission factor or calculation methodology used to determine emissions

3. Records must be maintained for 5 years and made available to the Division upon request.

4. The report must be submitted using a Division-approved format.

5. A certification by a responsible official must accompany the report stating the information is true, accurate and complete.

6. For wells in disproportionately impacted communities, additional reporting is required if there were 6 or more uncontrolled unloading events in any rolling 6-month period.

7. Starting in 2023, the report must also indicate if the well is equipped with artificial lift.

So in summary, detailed records of each unloading event must be kept and reported annually, including event details, emission estimates, and control information. The Division provides specific reporting forms and formats to use for submitting this information.

**Sources Used:** s3://sbx-kendra-index/Colorado/Colorado_-_Regulation_Number_3_-_Stationary_Source_Permitting_and_Air_Pollutant_Emission_Notice_Requirements.pdf, s3://sbx-kendra-index/Colorado/Colorado_-_Regulation_Number_7_-_Control_of_Emissions_from_Oil_and_Gas_Emissions_Operations.pdf, s3://sbx-colorado-only/Colorado/Colorado_-_General_Air_Permits__-_General_Permit_GP11.pdf, s3://sbx-colorado-only/Colorado/Colorado_-_Air_permitting_guidance_memos__-_Memo_20-04_-_Routine_or_Predictable_Gas_Venting_Emissions_Calculation_and_Instructions_on_Permitting_for_Oil_and_Natural_Gas_Operations.pdf

Enter your question (or type 'exit' to end):  exit


Exiting chat...
